http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index 5b3c687..7a70f9d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,9 @@ build/ runtime/ logs/ /bin/ + +*.class +target/ +nutch-core/target +nutch-plugins/target +nutch-plugins/*/target \ No newline at end of file
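The ignore rules added above cover Maven build output for the new core and plugin modules. As a quick sanity check of such patterns (the paths below are placeholders, not files from this commit), git's check-ignore command reports which rule matches a given path:

    git check-ignore -v nutch-core/target/nutch-core-1.13-SNAPSHOT.jar
    git check-ignore -v nutch-plugins/index-basic/target/classes/Example.class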
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/bin/crawl ---------------------------------------------------------------------- diff --git a/bin/crawl b/bin/crawl new file mode 100755 index 0000000..567d35e --- /dev/null +++ b/bin/crawl @@ -0,0 +1,281 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds> +# -i|--index Indexes crawl results into a configured indexer +# -w|--wait NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs +# are scheduled for fetching. Suffix can be: s for second, +# m for minute, h for hour and d for day. If no suffix is +# specified second is used by default. +# -D A Java property to pass to Nutch calls +# Seed Dir Directory in which to look for a seeds file +# Crawl Dir Directory where the crawl/link/segments dirs are saved +# Num Rounds The number of rounds to run this crawl for +# +# +# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND +# INDEXING FOR EACH SEGMENT + +INDEXFLAG=false +JAVA_PROPERTIES="" +WAIT=-1 # don't wait if there are no URLs to fetch + +function __to_seconds() { + NUMBER=$(echo $1 | tr -dc '0-9') + MODIFIER=$(echo $1 | tr -dc '[^s|h|m|d]]') + + case $MODIFIER in + m|M) + SECONDS=`expr $NUMBER \* 60` + ;; + h|H) + SECONDS=`expr $NUMBER \* 120` + ;; + d|D) + SECONDS=`expr $NUMBER \* 86400` + ;; + s|S|*) + SECONDS=$NUMBER + ;; + esac + + echo $SECONDS +} + +while [[ $# > 0 ]] +do + case $1 in + -i|--index) + INDEXFLAG=true + shift + ;; + -D) + JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}" + shift 2 + ;; + -w|--wait) + WAIT="${2}" + shift 2 + ;; + *) + break + ;; + esac +done + +if [[ $# != 3 ]]; then + echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>" + echo -e "\t-i|--index\tIndexes crawl results into a configured indexer" + echo -e "\t-D\t\tA Java property to pass to Nutch calls" + echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs" + echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second," + echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is" + echo -e "\t\t\tspecified second is used by default." + echo -e "\tSeed Dir\tDirectory in which to look for a seeds file" + echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved" + echo -e "\tNum Rounds\tThe number of rounds to run this crawl for" + exit 1 +fi + +SEEDDIR="$1" +CRAWL_PATH="$2" +LIMIT="$3" + +# convert wait time to seconds for compatibility reasons +if [ "$WAIT" != "-1" ]; then + WAIT=$( __to_seconds "$WAIT" ) + echo "Time to wait (--wait) = $WAIT sec." 
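# Illustrative invocation matching the usage documented above (the seed and
# crawl directories are placeholders, and the Solr URL is only an example
# property for whatever indexer plugin is configured):
#
#   bin/crawl -i -D "solr.server.url=http://localhost:8983/solr/nutch" -w 30m urls/ crawl/ 3
#
# This indexes each segment (-i), waits 30 minutes (converted to 1800 s by
# __to_seconds above) whenever no URLs are due for fetching, reads seeds from
# urls/, keeps the crawldb/linkdb/segments under crawl/, and runs at most 3
# generate-fetch-parse-update rounds.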
+fi + +############################################# +# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS # +############################################# + +# set the number of slaves nodes +numSlaves=1 + +# and the total number of available tasks +# sets Hadoop parameter "mapreduce.job.reduces" +numTasks=`expr $numSlaves \* 2` + +# number of urls to fetch in one iteration +# 250K per task? +sizeFetchlist=`expr $numSlaves \* 50000` + +# time limit for feching +timeLimitFetch=180 + +# num threads for fetching +numThreads=50 + +############################################# + +bin="`dirname "$0"`" +bin="`cd "$bin"; pwd`" + +# determines whether mode based on presence of job file +mode=local +if [ -f "${bin}"/../*nutch*.job ]; then + mode=distributed +fi + +# note that some of the options listed here could be set in the +# corresponding hadoop site xml param file +commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true" + + # check that hadoop can be found on the path +if [ $mode = "distributed" ]; then + if [ $(which hadoop | wc -l ) -eq 0 ]; then + echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode." + exit -1; + fi +fi + + +function __bin_nutch { + # run $bin/nutch, exit if exit value indicates error + + echo "$bin/nutch $@" ;# echo command and arguments + "$bin/nutch" "$@" + + RETCODE=$? + if [ $RETCODE -ne 0 ] + then + echo "Error running:" + echo " $bin/nutch $@" + echo "Failed with exit value $RETCODE." + exit $RETCODE + fi +} + + + +# initial injection +echo "Injecting seed URLs" +__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR" + +# main loop : rounds of generate - fetch - parse - update +for ((a=1; ; a++)) +do + if [ -e ".STOP" ] + then + echo "STOP file found - escaping loop" + break + fi + + if [ $LIMIT -ne -1 ]; then + if [ $a -gt $LIMIT ]; then + echo `date` ": Finished loop with $LIMIT iterations" + break + fi + echo `date` ": Iteration $a of $LIMIT" + else + echo `date` ": Iteration $a" + fi + + echo "Generating a new segment" + generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter) + echo "$bin/nutch generate ${generate_args[@]}" + $bin/nutch generate "${generate_args[@]}" + RETCODE=$? + if [ $RETCODE -eq 0 ]; then + : # ok: no error + elif [ $RETCODE -eq 1 ]; then + echo "Generate returned 1 (no new segments created)" + + if [ "$WAIT" -ne -1 ]; then + echo "Waiting for $WAIT sec. ..." + sleep $WAIT + continue + else + echo "Escaping loop: no more URLs to fetch now" + break + fi + else + echo "Error running:" + echo " $bin/nutch generate ${generate_args[@]}" + echo "Failed with exit value $RETCODE." 
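# Worked example of the sizing knobs above, assuming a hypothetical 4-slave
# cluster: numSlaves=4 gives numTasks=4*2=8 (mapreduce.job.reduces) and
# sizeFetchlist=4*50000=200000 URLs generated per round, while
# timeLimitFetch=180 still caps each fetch job at 180 minutes
# (fetcher.timelimit.mins).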
+ exit $RETCODE + fi + + # capture the name of the segment + # call hadoop in distributed mode + # or use ls + + if [ $mode = "local" ]; then + SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1` + else + SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1` + fi + + echo "Operating on segment : $SEGMENT" + + # fetching the segment + echo "Fetching : $SEGMENT" + __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads + + # parsing the segment + echo "Parsing : $SEGMENT" + # enable the skipping of records for the parsing so that a dodgy document + # so that it does not fail the full task + skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1" + __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT + + # updatedb with this segment + echo "CrawlDB update" + __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT + +# note that the link inversion - indexing routine can be done within the main loop +# on a per segment basis + echo "Link inversion" + __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT + + echo "Dedup on crawldb" + __bin_nutch dedup "$CRAWL_PATH"/crawldb + + if $INDEXFLAG; then + echo "Indexing $SEGMENT to index" + __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT + + echo "Cleaning up index if possible" + __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb + else + echo "Skipping indexing ..." + fi + + ####################################################### + # The following commands fall into WebGraph territory + # and should be uncommented based on your requirements + ####################################################### + #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/" + #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH" + + #echo "Running Loops Job on WebGraph within $CRAWL_PATH" + #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH" + + #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH" + #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH" + + #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH" + #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH" + + #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores" + #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores + +done + +exit 0 http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/bin/nutch ---------------------------------------------------------------------- diff --git a/bin/nutch b/bin/nutch new file mode 100755 index 0000000..1649069 --- /dev/null +++ b/bin/nutch @@ -0,0 +1,324 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# The Nutch command script +# +# Environment Variables +# +# NUTCH_JAVA_HOME The java implementation to use. Overrides JAVA_HOME. +# +# NUTCH_HEAPSIZE The maximum amount of heap to use, in MB. +# Default is 1000. +# +# NUTCH_OPTS Extra Java runtime options. +# Multiple options must be separated by white space. +# +# NUTCH_LOG_DIR Log directory (default: $NUTCH_HOME/logs) +# +# NUTCH_LOGFILE Log file (default: hadoop.log) +# +# NUTCH_CONF_DIR Path(s) to configuration files (default: $NUTCH_HOME/conf). +# Multiple paths must be separated by a colon ':'. +# +cygwin=false +case "`uname`" in +CYGWIN*) cygwin=true;; +esac + +# resolve links - $0 may be a softlink +THIS="$0" +while [ -h "$THIS" ]; do + ls=`ls -ld "$THIS"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + THIS="$link" + else + THIS=`dirname "$THIS"`/"$link" + fi +done + +# if no args specified, show usage +if [ $# = 0 ]; then + echo "nutch 1.12" + echo "Usage: nutch COMMAND" + echo "where COMMAND is one of:" + echo " readdb read / dump crawl db" + echo " mergedb merge crawldb-s, with optional filtering" + echo " readlinkdb read / dump link db" + echo " inject inject new urls into the database" + echo " generate generate new segments to fetch from crawl db" + echo " freegen generate new segments to fetch from text files" + echo " fetch fetch a segment's pages" + echo " parse parse a segment's pages" + echo " readseg read / dump segment data" + echo " mergesegs merge several segments, with optional filtering and slicing" + echo " updatedb update crawl db from segments after fetching" + echo " invertlinks create a linkdb from parsed segments" + echo " mergelinkdb merge linkdb-s, with optional filtering" + echo " index run the plugin-based indexer on parsed segments and linkdb" + echo " dedup deduplicate entries in the crawldb and give them a special status" + echo " dump exports crawled data from segments into files" + echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR" + echo " solrindex run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead" + echo " solrdedup remove duplicates from solr - DEPRECATED use the dedup command instead" + echo " solrclean remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead" + echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins" + echo " parsechecker check the parser for a given url" + echo " indexchecker check the indexing filters for a given url" + echo " filterchecker check url filters for a given url" + echo " normalizerchecker check url normalizers for a given url" + echo " domainstats calculate domain statistics from crawldb" + echo " protocolstats calculate protocol status code stats from crawldb" + echo " crawlcomplete calculate crawl completion stats from crawldb" + echo " webgraph generate a web graph from existing segments" + echo " linkrank run a link analysis program on the generated web graph" + echo " scoreupdater updates the crawldb with linkrank scores" + echo " 
nodedumper dumps the web graph's node scores" + echo " plugin load a plugin and run one of its classes main()" + echo " junit runs the given JUnit test" + echo " startserver runs the Nutch Server on localhost:8081" + echo " webapp run a local Nutch Web Application on locahost:8080" + echo " warc exports crawled data from segments at the WARC format" + echo " updatehostdb update the host db with records from the crawl db" + echo " readhostdb read / dump host db" + echo " or" + echo " CLASSNAME run the class named CLASSNAME" + echo "Most commands print help when invoked w/o parameters." + exit 1 +fi + +# get arguments +COMMAND=$1 +shift + +# some directories +THIS_DIR="`dirname "$THIS"`" +NUTCH_HOME="`cd "$THIS_DIR/.." ; pwd`" + +# some Java parameters +if [ "$NUTCH_JAVA_HOME" != "" ]; then + #echo "run java in $NUTCH_JAVA_HOME" + JAVA_HOME="$NUTCH_JAVA_HOME" +fi + +if [ "$JAVA_HOME" = "" ]; then + echo "Error: JAVA_HOME is not set." + exit 1 +fi + +local=true + +# NUTCH_JOB +if [ -f "${NUTCH_HOME}"/*nutch*.job ]; then + local=false + for f in "$NUTCH_HOME"/*nutch*.job; do + NUTCH_JOB="$f" + done + # cygwin path translation + if $cygwin; then + NUTCH_JOB="`cygpath -p -w "$NUTCH_JOB"`" + fi +fi + +JAVA="$JAVA_HOME/bin/java" +JAVA_HEAP_MAX=-Xmx1000m + +# check envvars which might override default args +if [ "$NUTCH_HEAPSIZE" != "" ]; then + #echo "run with heapsize $NUTCH_HEAPSIZE" + JAVA_HEAP_MAX="-Xmx""$NUTCH_HEAPSIZE""m" + #echo $JAVA_HEAP_MAX +fi + +# CLASSPATH initially contains $NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf +CLASSPATH="${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}" +CLASSPATH="${CLASSPATH}:$JAVA_HOME/lib/tools.jar" + +# so that filenames w/ spaces are handled correctly in loops below +IFS= + +# add libs to CLASSPATH +if $local; then + for f in "$NUTCH_HOME"/lib/*.jar; do + CLASSPATH="${CLASSPATH}:$f"; + done + # local runtime + # add plugins to classpath + if [ -d "$NUTCH_HOME/plugins" ]; then + CLASSPATH="${NUTCH_HOME}:${CLASSPATH}" + fi +fi + +# cygwin path translation +if $cygwin; then + CLASSPATH="`cygpath -p -w "$CLASSPATH"`" +fi + +# setup 'java.library.path' for native-hadoop code if necessary +# used only in local mode +JAVA_LIBRARY_PATH='' +if [ -d "${NUTCH_HOME}/lib/native" ]; then + + JAVA_PLATFORM=`"${JAVA}" -classpath "$CLASSPATH" org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g'` + + if [ -d "${NUTCH_HOME}/lib/native" ]; then + if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then + JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}" + else + JAVA_LIBRARY_PATH="${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}" + fi + fi +fi + +if [ $cygwin = true -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then + JAVA_LIBRARY_PATH="`cygpath -p -w "$JAVA_LIBRARY_PATH"`" +fi + +# restore ordinary behaviour +unset IFS + +# default log directory & file +if [ "$NUTCH_LOG_DIR" = "" ]; then + NUTCH_LOG_DIR="$NUTCH_HOME/logs" +fi +if [ "$NUTCH_LOGFILE" = "" ]; then + NUTCH_LOGFILE='hadoop.log' +fi + +#Fix log path under cygwin +if $cygwin; then + NUTCH_LOG_DIR="`cygpath -p -w "$NUTCH_LOG_DIR"`" +fi + +NUTCH_OPTS=($NUTCH_OPTS -Dhadoop.log.dir="$NUTCH_LOG_DIR") +NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Dhadoop.log.file="$NUTCH_LOGFILE") + +if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then + NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Djava.library.path="$JAVA_LIBRARY_PATH") +fi + +# figure out which class to run +if [ "$COMMAND" = "crawl" ] ; then + echo "Command $COMMAND is deprecated, please use bin/crawl instead" + exit -1 +elif [ "$COMMAND" = "inject" ] ; then + CLASS=org.apache.nutch.crawl.Injector +elif 
[ "$COMMAND" = "generate" ] ; then + CLASS=org.apache.nutch.crawl.Generator +elif [ "$COMMAND" = "freegen" ] ; then + CLASS=org.apache.nutch.tools.FreeGenerator +elif [ "$COMMAND" = "fetch" ] ; then + CLASS=org.apache.nutch.fetcher.Fetcher +elif [ "$COMMAND" = "parse" ] ; then + CLASS=org.apache.nutch.parse.ParseSegment +elif [ "$COMMAND" = "readdb" ] ; then + CLASS=org.apache.nutch.crawl.CrawlDbReader +elif [ "$COMMAND" = "mergedb" ] ; then + CLASS=org.apache.nutch.crawl.CrawlDbMerger +elif [ "$COMMAND" = "readlinkdb" ] ; then + CLASS=org.apache.nutch.crawl.LinkDbReader +elif [ "$COMMAND" = "readseg" ] ; then + CLASS=org.apache.nutch.segment.SegmentReader +elif [ "$COMMAND" = "mergesegs" ] ; then + CLASS=org.apache.nutch.segment.SegmentMerger +elif [ "$COMMAND" = "updatedb" ] ; then + CLASS=org.apache.nutch.crawl.CrawlDb +elif [ "$COMMAND" = "invertlinks" ] ; then + CLASS=org.apache.nutch.crawl.LinkDb +elif [ "$COMMAND" = "mergelinkdb" ] ; then + CLASS=org.apache.nutch.crawl.LinkDbMerger +elif [ "$COMMAND" = "dump" ] ; then + CLASS=org.apache.nutch.tools.FileDumper +elif [ "$COMMAND" = "commoncrawldump" ] ; then + CLASS=org.apache.nutch.tools.CommonCrawlDataDumper +elif [ "$COMMAND" = "solrindex" ] ; then + CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1" + shift +elif [ "$COMMAND" = "index" ] ; then + CLASS=org.apache.nutch.indexer.IndexingJob +elif [ "$COMMAND" = "solrdedup" ] ; then + echo "Command $COMMAND is deprecated, please use dedup instead" + exit -1 +elif [ "$COMMAND" = "dedup" ] ; then + CLASS=org.apache.nutch.crawl.DeduplicationJob +elif [ "$COMMAND" = "solrclean" ] ; then + CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1" + shift; shift +elif [ "$COMMAND" = "clean" ] ; then + CLASS=org.apache.nutch.indexer.CleaningJob +elif [ "$COMMAND" = "parsechecker" ] ; then + CLASS=org.apache.nutch.parse.ParserChecker +elif [ "$COMMAND" = "indexchecker" ] ; then + CLASS=org.apache.nutch.indexer.IndexingFiltersChecker +elif [ "$COMMAND" = "filterchecker" ] ; then + CLASS=org.apache.nutch.net.URLFilterChecker +elif [ "$COMMAND" = "normalizerchecker" ] ; then + CLASS=org.apache.nutch.net.URLNormalizerChecker +elif [ "$COMMAND" = "domainstats" ] ; then + CLASS=org.apache.nutch.util.domain.DomainStatistics +elif [ "$COMMAND" = "protocolstats" ] ; then + CLASS=org.apache.nutch.util.ProtocolStatusStatistics +elif [ "$COMMAND" = "crawlcomplete" ] ; then + CLASS=org.apache.nutch.util.CrawlCompletionStats +elif [ "$COMMAND" = "webgraph" ] ; then + CLASS=org.apache.nutch.scoring.webgraph.WebGraph +elif [ "$COMMAND" = "linkrank" ] ; then + CLASS=org.apache.nutch.scoring.webgraph.LinkRank +elif [ "$COMMAND" = "scoreupdater" ] ; then + CLASS=org.apache.nutch.scoring.webgraph.ScoreUpdater +elif [ "$COMMAND" = "nodedumper" ] ; then + CLASS=org.apache.nutch.scoring.webgraph.NodeDumper +elif [ "$COMMAND" = "plugin" ] ; then + CLASS=org.apache.nutch.plugin.PluginRepository +elif [ "$COMMAND" = "junit" ] ; then + CLASSPATH="$CLASSPATH:$NUTCH_HOME/test/classes/" + if $local; then + for f in "$NUTCH_HOME"/test/lib/*.jar; do + CLASSPATH="${CLASSPATH}:$f"; + done + fi + CLASS=org.junit.runner.JUnitCore +elif [ "$COMMAND" = "startserver" ] ; then + CLASS=org.apache.nutch.service.NutchServer +elif [ "$COMMAND" = "webapp" ] ; then + CLASS=org.apache.nutch.webui.NutchUiServer +elif [ "$COMMAND" = "warc" ] ; then + CLASS=org.apache.nutch.tools.warc.WARCExporter +elif [ "$COMMAND" = "updatehostdb" ] ; then + CLASS=org.apache.nutch.hostdb.UpdateHostDb +elif [ "$COMMAND" = 
"readhostdb" ] ; then + CLASS=org.apache.nutch.hostdb.ReadHostDb +else + CLASS=$COMMAND +fi + +# distributed mode +EXEC_CALL=(hadoop jar "$NUTCH_JOB") + +if $local; then + EXEC_CALL=("$JAVA" $JAVA_HEAP_MAX "${NUTCH_OPTS[@]}" -classpath "$CLASSPATH") +else + # check that hadoop can be found on the path + if [ $(which hadoop | wc -l ) -eq 0 ]; then + echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode." + exit -1; + fi +fi + +# run it +exec "${EXEC_CALL[@]}" $CLASS "$@" + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-core/pom.xml b/nutch-core/pom.xml new file mode 100644 index 0000000..62e2e58 --- /dev/null +++ b/nutch-core/pom.xml @@ -0,0 +1,522 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-parent</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>nutch-core</artifactId> + <packaging>jar</packaging> + + <name>Apache Nutch</name> + <description>Nutch is an open source web-search software. + It builds on Hadoop, Tika and Solr, adding web-specifics, + such as a crawler, a link-graph database etc. + </description> + <url>http://nutch.apache.org</url> + <licenses> + <license> + <name>The Apache Software License, Version 2.0</name> + <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url> + <distribution>repo</distribution> + </license> + </licenses> + + <scm> + <developerConnection>scm:git:https://git-wip-us.apache.org/repos/asf/nutch.git</developerConnection> + <connection>scm:git:http://git-wip-us.apache.org/repos/asf/nutch.git</connection> + <url>https://git-wip-us.apache.org/repos/asf/nutch.git</url> + </scm> + + <pluginRepositories> + <pluginRepository> + <id>miredot</id> + <name>MireDot Releases</name> + <url>http://nexus.qmino.com/content/repositories/miredot</url> + </pluginRepository> + </pluginRepositories> + + <developers> + <developer> + <id>mattmann</id> + <name>Chris A. 
Mattmann</name> + <email>[email protected]</email> + </developer> + <developer> + <id>jnioche</id> + <name>Julien Nioche</name> + <email>[email protected]</email> + </developer> + <developer> + <id>lewismc</id> + <name>Lewis John McGibbney</name> + <email>[email protected]</email> + </developer> + <developer> + <id>markus</id> + <name>Markus Jelsma</name> + <email>[email protected]</email> + </developer> + <developer> + <id>fenglu</id> + <name>Feng Lu</name> + <email>[email protected]</email> + </developer> + <developer> + <id>kiranch</id> + <name>Kiran Chitturi</name> + <email>[email protected]</email> + </developer> + <developer> + <id>tejasp</id> + <name>Tejas Patil</name> + <email>[email protected]</email> + </developer> + <developer> + <id>talat</id> + <name>Talat Uyarer</name> + <email>[email protected]</email> + </developer> + <developer> + <id>snagel</id> + <name>Sebastian Nagel</name> + <email>[email protected]</email> + </developer> + <developer> + <id>thammegowda</id> + <name>Thamme Gowda</name> + <email>[email protected]</email> + </developer> + </developers> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <slf4j.version>1.7.12</slf4j.version> + <junit.version>4.12</junit.version> + <dir.root>${project.parent.basedir}</dir.root> + <libs.dir>${dir.local}${file.separator}lib</libs.dir> + </properties> + + <dependencies> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + <version>${slf4j.version}</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + <version>${slf4j.version}</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>commons-lang</groupId> + <artifactId>commons-lang</artifactId> + <version>2.6</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>commons-collections</groupId> + <artifactId>commons-collections</artifactId> + <version>3.2.1</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>commons-httpclient</groupId> + <artifactId>commons-httpclient</artifactId> + <version>3.1</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>commons-codec</groupId> + <artifactId>commons-codec</artifactId> + <version>1.10</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + <version>1.9</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-jexl</artifactId> + <version>2.1.1</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>com.tdunning</groupId> + <artifactId>t-digest</artifactId> + <version>3.1</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + <version>2.4.0</version> + <optional>true</optional> + <exclusions> + <exclusion> + <groupId>hsqldb</groupId> + <artifactId>*</artifactId> + </exclusion> + <exclusion> + <groupId>net.sf.kosmosfs</groupId> + <artifactId>*</artifactId> + </exclusion> + <exclusion> + <groupId>net.java.dev.jets3t</groupId> + <artifactId>*</artifactId> + </exclusion> + <exclusion> + <groupId>org.eclipse.jdt</groupId> + <artifactId>*</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>*</artifactId> + </exclusion> + <exclusion> + <groupId>ant</groupId> + 
<artifactId>*</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-hdfs</artifactId> + <version>2.4.0</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-mapreduce-client-core</artifactId> + <version>2.4.0</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-mapreduce-client-jobclient</artifactId> + <version>2.4.0</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-core</artifactId> + <version>1.12</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>com.ibm.icu</groupId> + <artifactId>icu4j</artifactId> + <version>55.1</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>xerces</groupId> + <artifactId>xercesImpl</artifactId> + <version>2.11.0</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>xerces</groupId> + <artifactId>xmlParserAPIs</artifactId> + <version>2.6.2</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>oro</groupId> + <artifactId>oro</artifactId> + <version>2.0.8</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>16.0.1</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>com.github.crawler-commons</groupId> + <artifactId>crawler-commons</artifactId> + <version>0.6</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>com.martinkl.warc</groupId> + <artifactId>warc-hadoop</artifactId> + <version>0.1.0</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>org.apache.cxf</groupId> + <artifactId>cxf-rt-frontend-jaxws</artifactId> + <version>3.0.4</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.cxf</groupId> + <artifactId>cxf-rt-frontend-jaxrs</artifactId> + <version>3.0.4</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.cxf</groupId> + <artifactId>cxf-rt-transports-http</artifactId> + <version>3.0.4</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.cxf</groupId> + <artifactId>cxf-rt-transports-http-jetty</artifactId> + <version>3.0.4</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.cxf</groupId> + <artifactId>cxf-rt-rs-client</artifactId> + <version>3.0.4</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + <version>2.5.1</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.dataformat</groupId> + <artifactId>jackson-dataformat-cbor</artifactId> + <version>2.5.1</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.jaxrs</groupId> + <artifactId>jackson-jaxrs-json-provider</artifactId> + <version>2.5.1</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-common</artifactId> + <version>4.10.2</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.netpreserve.commons</groupId> + <artifactId>webarchive-commons</artifactId> + 
<version>1.1.5</version> + <optional>true</optional> + <exclusions> + <exclusion> + <groupId>*</groupId> + <artifactId>hadoop-core</artifactId> + </exclusion> + <exclusion> + <groupId>com.google.guava</groupId> + <artifactId>*</artifactId> + </exclusion> + <exclusion> + <groupId>junit</groupId> + <artifactId>*</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>4.11</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.mrunit</groupId> + <artifactId>mrunit</artifactId> + <version>1.1.0</version> + <classifier>hadoop2</classifier> + <optional>true</optional> + <exclusions> + <exclusion> + <groupId>log4j</groupId> + <artifactId>log4j</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty-client</artifactId> + <version>6.1.22</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty</artifactId> + <version>6.1.22</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty-util</artifactId> + <version>6.1.22</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-collections4</artifactId> + <version>4.0</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.springframework</groupId> + <artifactId>spring-core</artifactId> + <version>4.0.4.RELEASE</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.springframework</groupId> + <artifactId>spring-context</artifactId> + <version>4.0.4.RELEASE</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.springframework</groupId> + <artifactId>spring-web</artifactId> + <version>4.0.4.RELEASE</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>com.sun.jersey</groupId> + <artifactId>jersey-client</artifactId> + <version>1.8</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>com.j256.ormlite</groupId> + <artifactId>ormlite-jdbc</artifactId> + <version>4.48</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>com.h2database</groupId> + <artifactId>h2</artifactId> + <version>1.4.180</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.eclipse.persistence</groupId> + <artifactId>javax.persistence</artifactId> + <version>2.0.0</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.wicket</groupId> + <artifactId>wicket-core</artifactId> + <version>6.16.0</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.apache.wicket</groupId> + <artifactId>wicket-spring</artifactId> + <version>6.16.0</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>de.agilecoders.wicket</groupId> + <artifactId>wicket-bootstrap-core</artifactId> + <version>0.9.2</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>de.agilecoders.wicket</groupId> + <artifactId>wicket-bootstrap-extensions</artifactId> + <version>0.9.2</version> + <optional>true</optional> + </dependency> + </dependencies> + + <build> + <resources> + <resource> + <directory>${project.parent.basedir}${file.separator}conf</directory> + </resource> + </resources> + <plugins> + <plugin> + 
<groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <version>2.6</version> + <executions> + <execution> + <goals> + <goal>test-jar</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <artifactId>maven-resources-plugin</artifactId> + <version>3.0.1</version> + <executions> + <execution> + <id>copy-resources</id> + <phase>package</phase> + <goals> + <goal>copy-resources</goal> + </goals> + <configuration> + <outputDirectory>${libs.dir}</outputDirectory> + <resources> + <resource> + <directory>${project.build.directory}</directory> + <include>${build.finalName}.jar</include> + </resource> + <resource> + <directory>${project.basedir}</directory> + <include>plugin.xml</include> + </resource> + </resources> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <artifactId>maven-surefire-plugin</artifactId> + <version>2.19.1</version> + <configuration> + <excludedGroups>org.apache.nutch.test.IntegrationTest</excludedGroups> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-failsafe-plugin</artifactId> + <version>2.19.1</version> + <configuration> + <systemPropertyVariables> + <plugin.folders>../runtime/local/plugins</plugin.folders> + </systemPropertyVariables> + </configuration> + </plugin> + </plugins> + </build> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java b/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java new file mode 100755 index 0000000..c259419 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java @@ -0,0 +1,227 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; + +/** + * This class provides common methods for implementations of + * <code>FetchSchedule</code>. 
+ * + * @author Andrzej Bialecki + */ +public abstract class AbstractFetchSchedule extends Configured implements + FetchSchedule { + private static final Logger LOG = LoggerFactory + .getLogger(AbstractFetchSchedule.class); + + protected int defaultInterval; + protected int maxInterval; + + public AbstractFetchSchedule() { + super(null); + } + + public AbstractFetchSchedule(Configuration conf) { + super(conf); + } + + public void setConf(Configuration conf) { + super.setConf(conf); + if (conf == null) + return; + defaultInterval = conf.getInt("db.fetch.interval.default", 0); + maxInterval = conf.getInt("db.fetch.interval.max", 0); + LOG.info("defaultInterval=" + defaultInterval); + LOG.info("maxInterval=" + maxInterval); + } + + /** + * Initialize fetch schedule related data. Implementations should at least set + * the <code>fetchTime</code> and <code>fetchInterval</code>. The default + * implementation sets the <code>fetchTime</code> to now, using the default + * <code>fetchInterval</code>. + * + * @param url + * URL of the page. + * + * @param datum + * datum instance to be initialized (modified in place). + */ + public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) { + datum.setFetchTime(System.currentTimeMillis()); + datum.setFetchInterval(defaultInterval); + datum.setRetriesSinceFetch(0); + return datum; + } + + /** + * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a + * successfully fetched page. NOTE: this implementation resets the retry + * counter - extending classes should call super.setFetchSchedule() to + * preserve this behavior. + */ + public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, + long prevFetchTime, long prevModifiedTime, long fetchTime, + long modifiedTime, int state) { + datum.setRetriesSinceFetch(0); + return datum; + } + + /** + * This method specifies how to schedule refetching of pages marked as GONE. + * Default implementation increases fetchInterval by 50% but the value may + * never exceed <code>maxInterval</code>. + * + * @param url + * URL of the page. + * + * @param datum + * datum instance to be adjusted. + * + * @return adjusted page information, including all original information. + * NOTE: this may be a different instance than @see CrawlDatum, but + * implementations should make sure that it contains at least all + * information from @see CrawlDatum. + */ + public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum, + long prevFetchTime, long prevModifiedTime, long fetchTime) { + // no page is truly GONE ... just increase the interval by 50% + // and try much later. + if ((datum.getFetchInterval() * 1.5f) < maxInterval) + datum.setFetchInterval(datum.getFetchInterval() * 1.5f); + else + datum.setFetchInterval(maxInterval * 0.9f); + datum.setFetchTime(fetchTime + (long) datum.getFetchInterval() * 1000); + return datum; + } + + /** + * This method adjusts the fetch schedule if fetching needs to be re-tried due + * to transient errors. The default implementation sets the next fetch time 1 + * day in the future and increases the retry counter. + * + * @param url + * URL of the page. + * + * @param datum + * page information. + * + * @param prevFetchTime + * previous fetch time. + * + * @param prevModifiedTime + * previous modified time. + * + * @param fetchTime + * current fetch time. + * + * @return adjusted page information, including all original information. 
+ * NOTE: this may be a different instance than @see CrawlDatum, but + * implementations should make sure that it contains at least all + * information from @see CrawlDatum. + */ + public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum, + long prevFetchTime, long prevModifiedTime, long fetchTime) { + datum.setFetchTime(fetchTime + (long) SECONDS_PER_DAY * 1000); + datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1); + return datum; + } + + /** + * This method return the last fetch time of the CrawlDatum + * + * @return the date as a long. + */ + public long calculateLastFetchTime(CrawlDatum datum) { + return datum.getFetchTime() - (long) datum.getFetchInterval() * 1000; + } + + /** + * This method provides information whether the page is suitable for selection + * in the current fetchlist. NOTE: a true return value does not guarantee that + * the page will be fetched, it just allows it to be included in the further + * selection process based on scores. The default implementation checks + * <code>fetchTime</code>, if it is higher than the <code>curTime</code> it + * returns false, and true otherwise. It will also check that fetchTime is not + * too remote (more than <code>maxInterval</code>, in which case it lowers the + * interval and returns true. + * + * @param url + * URL of the page. + * + * @param datum + * datum instance. + * + * @param curTime + * reference time (usually set to the time when the fetchlist + * generation process was started). + * + * @return true, if the page should be considered for inclusion in the current + * fetchlist, otherwise false. + */ + public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) { + // pages are never truly GONE - we have to check them from time to time. + // pages with too long fetchInterval are adjusted so that they fit within + // maximum fetchInterval (segment retention period). + if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) { + if (datum.getFetchInterval() > maxInterval) { + datum.setFetchInterval(maxInterval * 0.9f); + } + datum.setFetchTime(curTime); + } + if (datum.getFetchTime() > curTime) { + return false; // not time yet + } + return true; + } + + /** + * This method resets fetchTime, fetchInterval, modifiedTime, + * retriesSinceFetch and page signature, so that it forces refetching. + * + * @param url + * URL of the page. + * + * @param datum + * datum instance. + * + * @param asap + * if true, force refetch as soon as possible - this sets the + * fetchTime to now. If false, force refetch whenever the next fetch + * time is set. 
+ */ + public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap) { + // reduce fetchInterval so that it fits within the max value + if (datum.getFetchInterval() > maxInterval) + datum.setFetchInterval(maxInterval * 0.9f); + datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); + datum.setRetriesSinceFetch(0); + datum.setSignature(null); + datum.setModifiedTime(0L); + if (asap) + datum.setFetchTime(System.currentTimeMillis()); + return datum; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java new file mode 100755 index 0000000..08cad34 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -0,0 +1,203 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.FloatWritable; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.util.NutchConfiguration; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class implements an adaptive re-fetch algorithm. This works as follows: + * <ul> + * <li>for pages that has changed since the last fetchTime, decrease their + * fetchInterval by a factor of DEC_FACTOR (default value is 0.2f).</li> + * <li>for pages that haven't changed since the last fetchTime, increase their + * fetchInterval by a factor of INC_FACTOR (default value is 0.2f).<br> + * If SYNC_DELTA property is true, then: + * <ul> + * <li>calculate a <code>delta = fetchTime - modifiedTime</code></li> + * <li>try to synchronize with the time of change, by shifting the next + * fetchTime by a fraction of the difference between the last modification time + * and the last fetch time. I.e. 
the next fetch time will be set to + * <code>fetchTime + fetchInterval - delta * SYNC_DELTA_RATE</code></li> + * <li>if the adjusted fetch interval is bigger than the delta, then + * <code>fetchInterval = delta</code>.</li> + * </ul> + * </li> + * <li>the minimum value of fetchInterval may not be smaller than MIN_INTERVAL + * (default is 1 minute).</li> + * <li>the maximum value of fetchInterval may not be bigger than MAX_INTERVAL + * (default is 365 days).</li> + * </ul> + * <p> + * NOTE: values of DEC_FACTOR and INC_FACTOR higher than 0.4f may destabilize + * the algorithm, so that the fetch interval either increases or decreases + * infinitely, with little relevance to the page changes. Please use + * {@link #main(String[])} method to test the values before applying them in a + * production system. + * </p> + * + * @author Andrzej Bialecki + */ +public class AdaptiveFetchSchedule extends AbstractFetchSchedule { + + // Loggg + public static final Logger LOG = LoggerFactory + .getLogger(AbstractFetchSchedule.class); + + protected float INC_RATE; + + protected float DEC_RATE; + + private float MAX_INTERVAL; + + private float MIN_INTERVAL; + + private boolean SYNC_DELTA; + + private double SYNC_DELTA_RATE; + + public void setConf(Configuration conf) { + super.setConf(conf); + if (conf == null) + return; + INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f); + DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f); + MIN_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.min_interval", (float) 60.0); + MAX_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.max_interval", + (float) SECONDS_PER_DAY * 365); // 1 year + SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true); + SYNC_DELTA_RATE = conf.getFloat( + "db.fetch.schedule.adaptive.sync_delta_rate", 0.2f); + } + + @Override + public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, + long prevFetchTime, long prevModifiedTime, long fetchTime, + long modifiedTime, int state) { + super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime, + fetchTime, modifiedTime, state); + + float interval = datum.getFetchInterval(); + long refTime = fetchTime; + + // https://issues.apache.org/jira/browse/NUTCH-1430 + interval = (interval == 0) ? defaultInterval : interval; + + if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) { + // Is fetch interval preset in CrawlDatum MD? 
Then use preset interval + FloatWritable customIntervalWritable = (FloatWritable) (datum + .getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY)); + interval = customIntervalWritable.get(); + } else { + if (modifiedTime <= 0) + modifiedTime = fetchTime; + switch (state) { + case FetchSchedule.STATUS_MODIFIED: + interval *= (1.0f - DEC_RATE); + break; + case FetchSchedule.STATUS_NOTMODIFIED: + interval *= (1.0f + INC_RATE); + break; + case FetchSchedule.STATUS_UNKNOWN: + break; + } + if (SYNC_DELTA) { + // try to synchronize with the time of change + long delta = (fetchTime - modifiedTime) / 1000L; + if (delta > interval) + interval = delta; + refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000); + } + if (interval < MIN_INTERVAL) { + interval = MIN_INTERVAL; + } else if (interval > MAX_INTERVAL) { + interval = MAX_INTERVAL; + } + } + + datum.setFetchInterval(interval); + datum.setFetchTime(refTime + Math.round(interval * 1000.0)); + datum.setModifiedTime(modifiedTime); + return datum; + } + + public static void main(String[] args) throws Exception { + FetchSchedule fs = new AdaptiveFetchSchedule(); + fs.setConf(NutchConfiguration.create()); + // we start the time at 0, for simplicity + long curTime = 0; + long delta = 1000L * 3600L * 24L; // 2 hours + // we trigger the update of the page every 30 days + long update = 1000L * 3600L * 24L * 30L; // 30 days + boolean changed = true; + long lastModified = 0; + int miss = 0; + int totalMiss = 0; + int maxMiss = 0; + int fetchCnt = 0; + int changeCnt = 0; + // initial fetchInterval is 10 days + CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f); + p.setFetchTime(0); + LOG.info(p.toString()); + // let's move the timeline a couple of deltas + for (int i = 0; i < 10000; i++) { + if (lastModified + update < curTime) { + // System.out.println("i=" + i + ", lastModified=" + lastModified + + // ", update=" + update + ", curTime=" + curTime); + changed = true; + changeCnt++; + lastModified = curTime; + } + LOG.info(i + ". " + changed + "\twill fetch at " + + (p.getFetchTime() / delta) + "\tinterval " + + (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed " + + miss); + if (p.getFetchTime() <= curTime) { + fetchCnt++; + fs.setFetchSchedule(new Text("http://www.example.com"), p, p + .getFetchTime(), p.getModifiedTime(), curTime, lastModified, + changed ? FetchSchedule.STATUS_MODIFIED + : FetchSchedule.STATUS_NOTMODIFIED); + LOG.info("\tfetched & adjusted: " + "\twill fetch at " + + (p.getFetchTime() / delta) + "\tinterval " + + (p.getFetchInterval() / SECONDS_PER_DAY) + " days"); + if (!changed) + miss++; + if (miss > maxMiss) + maxMiss = miss; + changed = false; + totalMiss += miss; + miss = 0; + } + if (changed) + miss++; + curTime += delta; + } + LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss); + LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + + " times."); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java new file mode 100644 index 0000000..7fe3e1e --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java @@ -0,0 +1,572 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import java.io.*; +import java.util.*; +import java.util.Map.Entry; + +import org.apache.commons.jexl2.JexlContext; +import org.apache.commons.jexl2.Expression; +import org.apache.commons.jexl2.JexlEngine; +import org.apache.commons.jexl2.MapContext; + +import org.apache.hadoop.io.*; +import org.apache.nutch.util.*; + +/* The crawl state of a url. */ +public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable { + public static final String GENERATE_DIR_NAME = "crawl_generate"; + public static final String FETCH_DIR_NAME = "crawl_fetch"; + public static final String PARSE_DIR_NAME = "crawl_parse"; + + private final static byte CUR_VERSION = 7; + + /** Compatibility values for on-the-fly conversion from versions < 5. */ + private static final byte OLD_STATUS_SIGNATURE = 0; + private static final byte OLD_STATUS_DB_UNFETCHED = 1; + private static final byte OLD_STATUS_DB_FETCHED = 2; + private static final byte OLD_STATUS_DB_GONE = 3; + private static final byte OLD_STATUS_LINKED = 4; + private static final byte OLD_STATUS_FETCH_SUCCESS = 5; + private static final byte OLD_STATUS_FETCH_RETRY = 6; + private static final byte OLD_STATUS_FETCH_GONE = 7; + + private static HashMap<Byte, Byte> oldToNew = new HashMap<Byte, Byte>(); + + /** Page was not fetched yet. */ + public static final byte STATUS_DB_UNFETCHED = 0x01; + /** Page was successfully fetched. */ + public static final byte STATUS_DB_FETCHED = 0x02; + /** Page no longer exists. */ + public static final byte STATUS_DB_GONE = 0x03; + /** Page temporarily redirects to other page. */ + public static final byte STATUS_DB_REDIR_TEMP = 0x04; + /** Page permanently redirects to other page. */ + public static final byte STATUS_DB_REDIR_PERM = 0x05; + /** Page was successfully fetched and found not modified. */ + public static final byte STATUS_DB_NOTMODIFIED = 0x06; + public static final byte STATUS_DB_DUPLICATE = 0x07; + + /** Maximum value of DB-related status. */ + public static final byte STATUS_DB_MAX = 0x1f; + + /** Fetching was successful. */ + public static final byte STATUS_FETCH_SUCCESS = 0x21; + /** Fetching unsuccessful, needs to be retried (transient errors). */ + public static final byte STATUS_FETCH_RETRY = 0x22; + /** Fetching temporarily redirected to other page. */ + public static final byte STATUS_FETCH_REDIR_TEMP = 0x23; + /** Fetching permanently redirected to other page. */ + public static final byte STATUS_FETCH_REDIR_PERM = 0x24; + /** Fetching unsuccessful - page is gone. */ + public static final byte STATUS_FETCH_GONE = 0x25; + /** Fetching successful - page is not modified. */ + public static final byte STATUS_FETCH_NOTMODIFIED = 0x26; + + /** Maximum value of fetch-related status. */ + public static final byte STATUS_FETCH_MAX = 0x3f; + + /** Page signature. 
*/ + public static final byte STATUS_SIGNATURE = 0x41; + /** Page was newly injected. */ + public static final byte STATUS_INJECTED = 0x42; + /** Page discovered through a link. */ + public static final byte STATUS_LINKED = 0x43; + /** Page got metadata from a parser */ + public static final byte STATUS_PARSE_META = 0x44; + + public static final HashMap<Byte, String> statNames = new HashMap<Byte, String>(); + static { + statNames.put(STATUS_DB_UNFETCHED, "db_unfetched"); + statNames.put(STATUS_DB_FETCHED, "db_fetched"); + statNames.put(STATUS_DB_GONE, "db_gone"); + statNames.put(STATUS_DB_REDIR_TEMP, "db_redir_temp"); + statNames.put(STATUS_DB_REDIR_PERM, "db_redir_perm"); + statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified"); + statNames.put(STATUS_DB_DUPLICATE, "db_duplicate"); + statNames.put(STATUS_SIGNATURE, "signature"); + statNames.put(STATUS_INJECTED, "injected"); + statNames.put(STATUS_LINKED, "linked"); + statNames.put(STATUS_FETCH_SUCCESS, "fetch_success"); + statNames.put(STATUS_FETCH_RETRY, "fetch_retry"); + statNames.put(STATUS_FETCH_REDIR_TEMP, "fetch_redir_temp"); + statNames.put(STATUS_FETCH_REDIR_PERM, "fetch_redir_perm"); + statNames.put(STATUS_FETCH_GONE, "fetch_gone"); + statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified"); + statNames.put(STATUS_PARSE_META, "parse_metadata"); + + oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED); + oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED); + oldToNew.put(OLD_STATUS_DB_GONE, STATUS_DB_GONE); + oldToNew.put(OLD_STATUS_FETCH_GONE, STATUS_FETCH_GONE); + oldToNew.put(OLD_STATUS_FETCH_SUCCESS, STATUS_FETCH_SUCCESS); + oldToNew.put(OLD_STATUS_FETCH_RETRY, STATUS_FETCH_RETRY); + oldToNew.put(OLD_STATUS_LINKED, STATUS_LINKED); + oldToNew.put(OLD_STATUS_SIGNATURE, STATUS_SIGNATURE); + } + + private byte status; + private long fetchTime = System.currentTimeMillis(); + private byte retries; + private int fetchInterval; + private float score = 0.0f; + private byte[] signature = null; + private long modifiedTime; + private org.apache.hadoop.io.MapWritable metaData; + + public static boolean hasDbStatus(CrawlDatum datum) { + if (datum.status <= STATUS_DB_MAX) + return true; + return false; + } + + public static boolean hasFetchStatus(CrawlDatum datum) { + if (datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX) + return true; + return false; + } + + public CrawlDatum() { + } + + public CrawlDatum(int status, int fetchInterval) { + this(); + this.status = (byte) status; + this.fetchInterval = fetchInterval; + } + + public CrawlDatum(int status, int fetchInterval, float score) { + this(status, fetchInterval); + this.score = score; + } + + // + // accessor methods + // + + public byte getStatus() { + return status; + } + + public static String getStatusName(byte value) { + String res = statNames.get(value); + if (res == null) + res = "unknown"; + return res; + } + + public void setStatus(int status) { + this.status = (byte) status; + } + + /** + * Returns either the time of the last fetch, or the next fetch time, + * depending on whether Fetcher or CrawlDbReducer set the time. + */ + public long getFetchTime() { + return fetchTime; + } + + /** + * Sets either the time of the last fetch or the next fetch time, depending on + * whether Fetcher or CrawlDbReducer set the time. 
+ */ + public void setFetchTime(long fetchTime) { + this.fetchTime = fetchTime; + } + + public long getModifiedTime() { + return modifiedTime; + } + + public void setModifiedTime(long modifiedTime) { + this.modifiedTime = modifiedTime; + } + + public byte getRetriesSinceFetch() { + return retries; + } + + public void setRetriesSinceFetch(int retries) { + this.retries = (byte) retries; + } + + public int getFetchInterval() { + return fetchInterval; + } + + public void setFetchInterval(int fetchInterval) { + this.fetchInterval = fetchInterval; + } + + public void setFetchInterval(float fetchInterval) { + this.fetchInterval = Math.round(fetchInterval); + } + + public float getScore() { + return score; + } + + public void setScore(float score) { + this.score = score; + } + + public byte[] getSignature() { + return signature; + } + + public void setSignature(byte[] signature) { + if (signature != null && signature.length > 256) + throw new RuntimeException("Max signature length (256) exceeded: " + + signature.length); + this.signature = signature; + } + + public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) { + this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable); + } + + /** + * Add all metadata from other CrawlDatum to this CrawlDatum. + * + * @param other + * CrawlDatum + */ + public void putAllMetaData(CrawlDatum other) { + for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) { + getMetaData().put(e.getKey(), e.getValue()); + } + } + + /** + * returns a MapWritable if it was set or read in @see readFields(DataInput), + * returns empty map in case CrawlDatum was freshly created (lazily + * instantiated). + */ + public org.apache.hadoop.io.MapWritable getMetaData() { + if (this.metaData == null) + this.metaData = new org.apache.hadoop.io.MapWritable(); + return this.metaData; + } + + // + // writable methods + // + + public static CrawlDatum read(DataInput in) throws IOException { + CrawlDatum result = new CrawlDatum(); + result.readFields(in); + return result; + } + + public void readFields(DataInput in) throws IOException { + byte version = in.readByte(); // read version + if (version > CUR_VERSION) // check version + throw new VersionMismatchException(CUR_VERSION, version); + + status = in.readByte(); + fetchTime = in.readLong(); + retries = in.readByte(); + if (version > 5) { + fetchInterval = in.readInt(); + } else + fetchInterval = Math.round(in.readFloat()); + score = in.readFloat(); + if (version > 2) { + modifiedTime = in.readLong(); + int cnt = in.readByte(); + if (cnt > 0) { + signature = new byte[cnt]; + in.readFully(signature); + } else + signature = null; + } + + if (version > 3) { + boolean hasMetadata = false; + if (version < 7) { + org.apache.hadoop.io.MapWritable oldMetaData = new org.apache.hadoop.io.MapWritable(); + if (in.readBoolean()) { + hasMetadata = true; + metaData = new org.apache.hadoop.io.MapWritable(); + oldMetaData.readFields(in); + } + for (Writable key : oldMetaData.keySet()) { + metaData.put(key, oldMetaData.get(key)); + } + } else { + if (in.readBoolean()) { + hasMetadata = true; + metaData = new org.apache.hadoop.io.MapWritable(); + metaData.readFields(in); + } + } + if (hasMetadata == false) + metaData = null; + } + // translate status codes + if (version < 5) { + if (oldToNew.containsKey(status)) + status = oldToNew.get(status); + else + status = STATUS_DB_UNFETCHED; + + } + } + + /** The number of bytes into a CrawlDatum that the score is stored. 
*/ + private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4; + private static final int SIG_OFFSET = SCORE_OFFSET + 4 + 8; + + public void write(DataOutput out) throws IOException { + out.writeByte(CUR_VERSION); // store current version + out.writeByte(status); + out.writeLong(fetchTime); + out.writeByte(retries); + out.writeInt(fetchInterval); + out.writeFloat(score); + out.writeLong(modifiedTime); + if (signature == null) { + out.writeByte(0); + } else { + out.writeByte(signature.length); + out.write(signature); + } + if (metaData != null && metaData.size() > 0) { + out.writeBoolean(true); + metaData.write(out); + } else { + out.writeBoolean(false); + } + } + + /** Copy the contents of another instance into this instance. */ + public void set(CrawlDatum that) { + this.status = that.status; + this.fetchTime = that.fetchTime; + this.retries = that.retries; + this.fetchInterval = that.fetchInterval; + this.score = that.score; + this.modifiedTime = that.modifiedTime; + this.signature = that.signature; + if (that.metaData != null) { + this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make + // a + // deep + // copy + } else { + this.metaData = null; + } + } + + // + // compare methods + // + + /** Sort by decreasing score. */ + public int compareTo(CrawlDatum that) { + if (that.score != this.score) + return (that.score - this.score) > 0 ? 1 : -1; + if (that.status != this.status) + return this.status - that.status; + if (that.fetchTime != this.fetchTime) + return (that.fetchTime - this.fetchTime) > 0 ? 1 : -1; + if (that.retries != this.retries) + return that.retries - this.retries; + if (that.fetchInterval != this.fetchInterval) + return (that.fetchInterval - this.fetchInterval) > 0 ? 1 : -1; + if (that.modifiedTime != this.modifiedTime) + return (that.modifiedTime - this.modifiedTime) > 0 ? 1 : -1; + return SignatureComparator._compare(this, that); + } + + /** A Comparator optimized for CrawlDatum. */ + public static class Comparator extends WritableComparator { + public Comparator() { + super(CrawlDatum.class); + } + + public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { + float score1 = readFloat(b1, s1 + SCORE_OFFSET); + float score2 = readFloat(b2, s2 + SCORE_OFFSET); + if (score2 != score1) { + return (score2 - score1) > 0 ? 1 : -1; + } + int status1 = b1[s1 + 1]; + int status2 = b2[s2 + 1]; + if (status2 != status1) + return status1 - status2; + long fetchTime1 = readLong(b1, s1 + 1 + 1); + long fetchTime2 = readLong(b2, s2 + 1 + 1); + if (fetchTime2 != fetchTime1) + return (fetchTime2 - fetchTime1) > 0 ? 1 : -1; + int retries1 = b1[s1 + 1 + 1 + 8]; + int retries2 = b2[s2 + 1 + 1 + 8]; + if (retries2 != retries1) + return retries2 - retries1; + int fetchInterval1 = readInt(b1, s1 + 1 + 1 + 8 + 1); + int fetchInterval2 = readInt(b2, s2 + 1 + 1 + 8 + 1); + if (fetchInterval2 != fetchInterval1) + return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1; + long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4); + long modifiedTime2 = readLong(b2, s2 + SCORE_OFFSET + 4); + if (modifiedTime2 != modifiedTime1) + return (modifiedTime2 - modifiedTime1) > 0 ? 
1 : -1; + int sigl1 = b1[s1 + SIG_OFFSET]; + int sigl2 = b2[s2 + SIG_OFFSET]; + return SignatureComparator._compare(b1, SIG_OFFSET, sigl1, b2, + SIG_OFFSET, sigl2); + } + } + + static { // register this comparator + WritableComparator.define(CrawlDatum.class, new Comparator()); + } + + // + // basic methods + // + + public String toString() { + StringBuilder buf = new StringBuilder(); + buf.append("Version: " + CUR_VERSION + "\n"); + buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus()) + + ")\n"); + buf.append("Fetch time: " + new Date(getFetchTime()) + "\n"); + buf.append("Modified time: " + new Date(getModifiedTime()) + "\n"); + buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n"); + buf.append("Retry interval: " + getFetchInterval() + " seconds (" + + (getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n"); + buf.append("Score: " + getScore() + "\n"); + buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n"); + buf.append("Metadata: \n "); + if (metaData != null) { + for (Entry<Writable, Writable> e : metaData.entrySet()) { + buf.append("\t"); + buf.append(e.getKey()); + buf.append("="); + buf.append(e.getValue()); + buf.append("\n"); + } + } + return buf.toString(); + } + + private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) { + if (metaData == null || metaData.size() == 0) { + return otherMetaData == null || otherMetaData.size() == 0; + } + if (otherMetaData == null) { + // we already know that the current object is not null or empty + return false; + } + HashSet<Entry<Writable, Writable>> set1 = new HashSet<Entry<Writable, Writable>>( + metaData.entrySet()); + HashSet<Entry<Writable, Writable>> set2 = new HashSet<Entry<Writable, Writable>>( + otherMetaData.entrySet()); + return set1.equals(set2); + } + + public boolean equals(Object o) { + if (!(o instanceof CrawlDatum)) + return false; + CrawlDatum other = (CrawlDatum) o; + boolean res = (this.status == other.status) + && (this.fetchTime == other.fetchTime) + && (this.modifiedTime == other.modifiedTime) + && (this.retries == other.retries) + && (this.fetchInterval == other.fetchInterval) + && (SignatureComparator._compare(this.signature, other.signature) == 0) + && (this.score == other.score); + if (!res) + return res; + return metadataEquals(other.metaData); + } + + public int hashCode() { + int res = 0; + if (signature != null) { + for (int i = 0; i < signature.length / 4; i += 4) { + res ^= (signature[i] << 24 + signature[i + 1] << 16 + signature[i + 2] << 8 + signature[i + 3]); + } + } + if (metaData != null) { + res ^= metaData.entrySet().hashCode(); + } + return res ^ status ^ ((int) fetchTime) ^ ((int) modifiedTime) ^ retries + ^ fetchInterval ^ Float.floatToIntBits(score); + } + + public Object clone() { + try { + return super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + + public boolean evaluate(Expression expr) { + if (expr != null) { + // Create a context and add data + JexlContext jcontext = new MapContext(); + + // https://issues.apache.org/jira/browse/NUTCH-2229 + jcontext.set("status", getStatusName(getStatus())); + jcontext.set("fetchTime", (long)(getFetchTime())); + jcontext.set("modifiedTime", (long)(getModifiedTime())); + jcontext.set("retries", getRetriesSinceFetch()); + jcontext.set("interval", new Integer(getFetchInterval())); + jcontext.set("score", getScore()); + jcontext.set("signature", StringUtil.toHexString(getSignature())); + + // Set metadata variables + for 
(Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) { + Object value = entry.getValue(); + + if (value instanceof FloatWritable) { + FloatWritable fvalue = (FloatWritable)value; + Text tkey = (Text)entry.getKey(); + jcontext.set(tkey.toString(), fvalue.get()); + } + + if (value instanceof IntWritable) { + IntWritable ivalue = (IntWritable)value; + Text tkey = (Text)entry.getKey(); + jcontext.set(tkey.toString(), ivalue.get()); + } + + if (value instanceof Text) { + Text tvalue = (Text)value; + Text tkey = (Text)entry.getKey(); + // a dash is not valid in a JEXL variable name, so dashed metadata keys are exposed with underscores + jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString()); + } + } + + try { + if (Boolean.TRUE.equals(expr.evaluate(jcontext))) { + return true; + } + } catch (Exception e) { + // a malformed or failing expression is treated as a non-match + } + } + + return false; + } +} \ No newline at end of file
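The evaluate(Expression) method added above backs the JEXL-based filtering referenced for NUTCH-2229. A minimal, self-contained sketch of how it might be driven (not part of the commit; the driver class and the example expression are made up, and only the CrawlDatum and commons-jexl2 calls shown in the diff are assumed):

    import org.apache.commons.jexl2.Expression;
    import org.apache.commons.jexl2.JexlEngine;
    import org.apache.nutch.crawl.CrawlDatum;

    // Hypothetical driver, not part of Nutch; illustrates CrawlDatum.evaluate(Expression).
    public class CrawlDatumJexlSketch {
      public static void main(String[] args) {
        // evaluate() exposes these JEXL variables: status, fetchTime, modifiedTime,
        // retries, interval, score, signature, plus any metadata entries.
        JexlEngine jexl = new JexlEngine();
        Expression expr = jexl.createExpression("status == 'db_fetched' && score > 0.5");

        CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 30 * 24 * 3600);
        datum.setScore(1.0f);
        datum.setSignature(new byte[16]);   // any signature; evaluate() exposes it as a hex string

        System.out.println(datum.evaluate(expr));   // prints: true
      }
    }

Because evaluate() puts the symbolic name from getStatusName() into the context, expressions compare against strings such as 'db_fetched' rather than the numeric status byte.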

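The raw-byte Comparator above relies on the field order produced by write(). A small round-trip sketch (again not part of the commit; the class name and values are illustrative) showing that SCORE_OFFSET = 1 + 1 + 8 + 1 + 4 indeed lands on the serialized score:

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import org.apache.nutch.crawl.CrawlDatum;

    // Hypothetical check, not part of Nutch; only CrawlDatum.write()/read() from the diff are assumed.
    public class CrawlDatumLayoutSketch {
      public static void main(String[] args) throws IOException {
        CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS, 3600, 2.5f);

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        datum.write(new DataOutputStream(bytes));
        byte[] buf = bytes.toByteArray();

        // write() emits version(1) + status(1) + fetchTime(8) + retries(1) + fetchInterval(4),
        // i.e. 15 bytes, before the score, which is the same sum as SCORE_OFFSET.
        float score = new DataInputStream(new ByteArrayInputStream(buf, 15, 4)).readFloat();
        System.out.println(score);              // prints: 2.5

        // The full record also deserializes back to an equal datum.
        CrawlDatum copy = CrawlDatum.read(new DataInputStream(new ByteArrayInputStream(buf)));
        System.out.println(copy.equals(datum)); // prints: true
      }
    }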