Author: tejasp
Date: Wed Jan 22 11:25:25 2014
New Revision: 1560316
URL: http://svn.apache.org/r1560316
Log:
NUTCH-1325 HostDB for Nutch
Added:
nutch/trunk/src/java/org/apache/nutch/util/hostdb/
nutch/trunk/src/java/org/apache/nutch/util/hostdb/DumpHostDb.java
nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDatum.java
nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/log4j.properties
nutch/trunk/src/bin/nutch
nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1560316&r1=1560315&r2=1560316&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 22 11:25:25 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1325 HostDB for Nutch (markus, tejasp)
+
* NUTCH-1680 CrawlDbReader to dump minRetry value (markus)
* NUTCH-1699 Tika Parser - Image Parse Bug (Mehmet Zahid Yüzügüldü, snagel via lewismc)
Modified: nutch/trunk/conf/log4j.properties
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1560316&r1=1560315&r2=1560316&view=diff
==============================================================================
--- nutch/trunk/conf/log4j.properties (original)
+++ nutch/trunk/conf/log4j.properties Wed Jan 22 11:25:25 2014
@@ -31,6 +31,8 @@ log4j.logger.org.apache.nutch.scoring.we
log4j.logger.org.apache.nutch.scoring.webgraph.LinkRank=INFO,cmdstdout
log4j.logger.org.apache.nutch.scoring.webgraph.Loops=INFO,cmdstdout
log4j.logger.org.apache.nutch.scoring.webgraph.ScoreUpdater=INFO,cmdstdout
+log4j.logger.org.apache.nutch.util.hostdb.HostDb=INFO,cmdstdout
+log4j.logger.org.apache.nutch.util.hostdb.DumpHostDb=INFO,cmdstdout
log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout
log4j.logger.org.apache.nutch.indexer.IndexingFiltersChecker=INFO,cmdstdout
log4j.logger.org.apache.nutch.tools.FreeGenerator=INFO,cmdstdout
Modified: nutch/trunk/src/bin/nutch
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1560316&r1=1560315&r2=1560316&view=diff
==============================================================================
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Wed Jan 22 11:25:25 2014
@@ -66,6 +66,8 @@ if [ $# = 0 ]; then
echo " solrdedup remove duplicates from solr - DEPRECATED use the
dedup command instead"
echo " solrclean remove HTTP 301 and 404 documents from solr -
DEPRECATED use the clean command instead"
echo " clean remove HTTP 301 and 404 documents and duplicates
from indexing backends configured via plugins"
+ echo " hostdb create a HostDB (or update an earlier one) from
the CrawlDB"
+ echo " readhostdb dumps HostDB data"
echo " parsechecker check the parser for a given url"
echo " indexchecker check the indexing filters for a given url"
echo " domainstats calculate domain statistics from crawldb"
@@ -236,6 +238,10 @@ elif [ "$COMMAND" = "solrclean" ] ; then
shift; shift
elif [ "$COMMAND" = "clean" ] ; then
CLASS=org.apache.nutch.indexer.CleaningJob
+elif [ "$COMMAND" = "hostdb" ] ; then
+ CLASS=org.apache.nutch.util.hostdb.HostDb
+elif [ "$COMMAND" = "readhostdb" ] ; then
+ CLASS=org.apache.nutch.util.hostdb.DumpHostDb
elif [ "$COMMAND" = "parsechecker" ] ; then
CLASS=org.apache.nutch.parse.ParserChecker
elif [ "$COMMAND" = "indexchecker" ] ; then
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?rev=1560316&r1=1560315&r2=1560316&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Wed Jan 22 11:25:25 2014
@@ -48,6 +48,7 @@ public class NutchWritable extends Gener
org.apache.nutch.protocol.Content.class,
org.apache.nutch.protocol.ProtocolStatus.class,
org.apache.nutch.scoring.webgraph.LinkDatum.class,
+ org.apache.nutch.util.hostdb.HostDatum.class,
};
}
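[Editor's note: HostDatum is registered here because HostDb's mapper emits values of mixed types (HostDatum, CrawlDatum-derived statistics, FloatWritable scores) through the shuffle wrapped in a single NutchWritable, and GenericWritable can only (de)serialize registered classes. A minimal sketch of the wrap/unwrap pattern the reducer relies on; this demo class is illustrative only and not part of the patch:

    import org.apache.hadoop.io.Writable;
    import org.apache.nutch.crawl.NutchWritable;
    import org.apache.nutch.util.hostdb.HostDatum;

    public class NutchWritableDemo {
      public static void main(String[] args) {
        // Wrap a HostDatum, as the HostDb mapper does before context.write()
        NutchWritable wrapped = new NutchWritable(new HostDatum());
        // GenericWritable.get() returns the wrapped instance on the reduce side
        Writable inner = wrapped.get();
        System.out.println(inner instanceof HostDatum); // prints: true
      }
    }
]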
Added: nutch/trunk/src/java/org/apache/nutch/util/hostdb/DumpHostDb.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/hostdb/DumpHostDb.java?rev=1560316&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/hostdb/DumpHostDb.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/util/hostdb/DumpHostDb.java Wed Jan 22 11:25:25 2014
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util.hostdb;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TimingUtil;
+
+/**
+ * A utility to dump the contents of HostDB.
+ */
+public class DumpHostDb extends Configured implements Tool {
+
+ public static final Logger LOG = LoggerFactory.getLogger(DumpHostDb.class);
+
+  public static final String HOSTDB_FAILURE_THRESHOLD = "hostdb.failure.threshold";
+  public static final String HOSTDB_NUM_PAGES_THRESHOLD = "hostdb.num.pages.threshold";
+  public static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+
+  static class DumpHostDbMapper extends Mapper<Text, HostDatum, Text, HostDatum> {
+ protected Integer failureThreshold = -1;
+ protected Integer numPagesThreshold = -1;
+
+ public void setup(Context context) {
+ Configuration conf = context.getConfiguration();
+ failureThreshold = conf.getInt(HOSTDB_FAILURE_THRESHOLD, -1);
+ numPagesThreshold = conf.getInt(HOSTDB_NUM_PAGES_THRESHOLD, -1);
+ }
+
+ public void map(Text key, HostDatum datum, Context context)
+ throws IOException, InterruptedException {
+ boolean filter = false;
+
+      if (numPagesThreshold != -1 && (datum.getStat(CrawlDatum.STATUS_DB_FETCHED) +
+          datum.getStat(CrawlDatum.STATUS_DB_NOTMODIFIED)) < numPagesThreshold)
+        filter = true;
+      if (failureThreshold != -1 && datum.numFailures() < failureThreshold)
+        filter = true;
+
+ if(!filter)
+ context.write(key, datum);
+ }
+ }
+
+ private void dumpHostDb(Path hostDb, Path output, Integer failureThreshold,
+ Integer numPagesThreshold) throws Exception {
+
+ long start = System.currentTimeMillis();
+ LOG.info("HostDb dump: starting at " + sdf.format(start));
+
+ Configuration conf = getConf();
+ conf.setInt(HOSTDB_FAILURE_THRESHOLD, failureThreshold);
+ conf.setInt(HOSTDB_NUM_PAGES_THRESHOLD, numPagesThreshold);
+ conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+ Job job = new Job(conf, "DumpHostDb");
+ job.setJarByClass(DumpHostDb.class);
+
+ FileInputFormat.addInputPath(job, new Path(hostDb, "current"));
+ FileOutputFormat.setOutputPath(job, output);
+
+ job.setMapperClass(DumpHostDbMapper.class);
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ job.setOutputFormatClass(TextOutputFormat.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(HostDatum.class);
+
+ try {
+ job.waitForCompletion(true);
+ } catch (Exception e) {
+ LOG.info("Caught exception " + StringUtils.stringifyException(e));
+ throw e;
+ }
+
+ long end = System.currentTimeMillis();
+ LOG.info("HostDb dump: finished at " + sdf.format(end) + ", elapsed: " +
TimingUtil.elapsedTime(start, end));
+ }
+
+ public static void main(String args[]) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new DumpHostDb(), args);
+ System.exit(res);
+ }
+
+ public static void usage() {
+ System.err.println("Usage: DumpHostDb <hostdb> <output>
[-numPagesThreshold <threshold>] [-dumpFailedHosts <threshold>]");
+ System.err.println("\t<hostdb>\tdirectory name where hostdb is located");
+ System.err.println("\t<output>\toutput location where the dump will be
produced");
+ System.err.println("\n Optional arguments:");
+ System.err.println("\t[-dumpFailedHosts <threshold>]\tlist status sorted
by host");
+ System.err.println("\t[-numPagesThreshold <threshold>]\tthreshold for
fetched pages of the hosts");
+ }
+
+ public int run(String[] args) throws Exception {
+ if (args.length < 2) {
+ usage();
+ return -1;
+ }
+
+ Path hostdb = new Path(args[0]);
+ Path output = new Path(args[1]);
+
+ Integer failureThreshold = -1;
+ Integer numPagesThreshold = -1;
+
+ for (int i = 2; i < args.length; i++) {
+ if (args[i].equals("-dumpFailedHosts")) {
+ failureThreshold = Integer.parseInt(args[++i]);
+ LOG.info("HostDb dump: dumping failed hosts with a threshold of " +
failureThreshold);
+ }
+ else if (args[i].equals("-numPagesThreshold")) {
+ numPagesThreshold = Integer.parseInt(args[++i]);
+ LOG.info("HostDb dump: dumping hosts with page threshold of " +
numPagesThreshold );
+ }
+ else {
+ System.err.println("HostDb dump: Found invalid argument : \"" +
args[i] + "\"\n");
+ usage();
+ return -1;
+ }
+ }
+
+ try {
+ dumpHostDb(hostdb, output, failureThreshold, numPagesThreshold);
+ return 0;
+ } catch (Exception e) {
+ LOG.error("HostDb dump: " + StringUtils.stringifyException(e));
+ return -1;
+ }
+ }
+}
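[Editor's note: for reference, a couple of invocation examples for the dump tool via the bin/nutch alias added above; paths and threshold values are hypothetical:

    # dump only hosts with at least 100 fetched or not-modified pages
    bin/nutch readhostdb crawl/hostdb crawl/hostdb-dump -numPagesThreshold 100

    # dump only hosts with at least 5 DNS or connection failures
    bin/nutch readhostdb crawl/hostdb crawl/hostdb-failed -dumpFailedHosts 5
]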
Added: nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDatum.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDatum.java?rev=1560316&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDatum.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDatum.java Wed Jan 22 11:25:25 2014
@@ -0,0 +1,258 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.hostdb;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map.Entry;
+import java.text.SimpleDateFormat;
+
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.VersionMismatchException;
+import org.apache.hadoop.io.Writable;
+import org.apache.nutch.crawl.CrawlDatum;
+
+/**
+ * Contains information about a host.
+ */
+public class HostDatum implements Writable, Cloneable {
+ private static final String EMPTY_STRING = "";
+ private static final byte CUR_VERSION = 1;
+ private static final Date DEFAULT_DATE = new Date(0);
+
+ private float score = 0;
+ private Date lastCheck = DEFAULT_DATE;
+ private String homepageUrl = EMPTY_STRING;
+
+  // Records the number of times a DNS look-up failed; may indicate the host no longer exists
+  private int dnsFailures = 0;
+
+  // Records the number of connection failures; may indicate our network is blocked by a firewall
+  private int connectionFailures = 0;
+
+ // Counts for various url statuses
+ private HashMap<Byte, Integer> statCounts = new HashMap<Byte, Integer>();
+
+ private MapWritable metaData = new MapWritable();
+
+ public HostDatum() {
+ resetStatistics();
+ }
+
+ public boolean isEmpty() {
+ return lastCheck.getTime() == 0;
+ }
+
+ public float getScore() { return score; }
+ public void setScore(float score) { this.score = score; }
+
+ public Date getLastCheck() { return lastCheck; }
+ public void setLastCheck() { setLastCheck(new Date()); }
+ public void setLastCheck(Date date) { lastCheck = date; }
+
+  public boolean hasHomepageUrl() { return homepageUrl.compareTo(EMPTY_STRING) != 0; }
+  public String getHomepageUrl() { return homepageUrl; }
+  public void setHomepageUrl(String homepageUrl) { this.homepageUrl = homepageUrl; }
+
+ public int getDnsFailures() { return dnsFailures; }
+ public void incDnsFailures() { this.dnsFailures++; }
+ public void setDnsFailures(int i) { this.dnsFailures = i; }
+
+ public int getConnectionFailures() { return connectionFailures; }
+ public void setConnectionFailures(int i) { this.connectionFailures = i; }
+  public int numFailures() { return getDnsFailures() + getConnectionFailures(); }
+
+ public Integer getStat(byte key) { return statCounts.get(key); }
+ public void setStat(byte key, int val) { statCounts.put(key, val); }
+
+ public void addStat(byte key, HostDatum other) {
+ setStat(key, getStat(key) + other.getStat(key));
+ }
+
+ public Integer numRecords() {
+ return statCounts.get(CrawlDatum.STATUS_DB_UNFETCHED) +
+ statCounts.get(CrawlDatum.STATUS_DB_FETCHED) +
+ statCounts.get(CrawlDatum.STATUS_DB_NOTMODIFIED) +
+ statCounts.get(CrawlDatum.STATUS_DB_REDIR_PERM) +
+ statCounts.get(CrawlDatum.STATUS_DB_REDIR_TEMP) +
+ statCounts.get(CrawlDatum.STATUS_DB_GONE);
+ }
+
+ public void resetStatistics() {
+ statCounts.put(CrawlDatum.STATUS_DB_UNFETCHED, 0);
+ statCounts.put(CrawlDatum.STATUS_DB_FETCHED, 0);
+ statCounts.put(CrawlDatum.STATUS_DB_NOTMODIFIED, 0);
+ statCounts.put(CrawlDatum.STATUS_DB_REDIR_PERM, 0);
+ statCounts.put(CrawlDatum.STATUS_DB_REDIR_TEMP, 0);
+ statCounts.put(CrawlDatum.STATUS_DB_GONE, 0);
+ }
+
+  /**
+   * Returns the MapWritable if it was set or read in {@link #readFields(DataInput)}.
+   * Returns an empty map in case the HostDatum was freshly created (lazily instantiated).
+   */
+ public MapWritable getMetaData() {
+ if (this.metaData == null) this.metaData = new MapWritable();
+ return this.metaData;
+ }
+
+ /**
+ * Add all metadata from other HostDatum to this HostDatum.
+ */
+ public void putAllMetaData(HostDatum other) {
+ for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) {
+ getMetaData().put(e.getKey(), e.getValue());
+ }
+ }
+
+ public void setMetaData(MapWritable mapWritable) {
+ this.metaData = new MapWritable(mapWritable);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ byte version = in.readByte();
+ if (version > CUR_VERSION) // check version
+ throw new VersionMismatchException(CUR_VERSION, version);
+
+ score = in.readFloat();
+ lastCheck = new Date(in.readLong());
+ homepageUrl = Text.readString(in);
+
+ dnsFailures = in.readInt();
+ connectionFailures = in.readInt();
+
+ statCounts.put(CrawlDatum.STATUS_DB_UNFETCHED, in.readInt());
+ statCounts.put(CrawlDatum.STATUS_DB_FETCHED, in.readInt());
+ statCounts.put(CrawlDatum.STATUS_DB_NOTMODIFIED, in.readInt());
+ statCounts.put(CrawlDatum.STATUS_DB_REDIR_PERM, in.readInt());
+ statCounts.put(CrawlDatum.STATUS_DB_REDIR_TEMP, in.readInt());
+ statCounts.put(CrawlDatum.STATUS_DB_GONE, in.readInt());
+
+ metaData = new MapWritable();
+ metaData.readFields(in);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeByte(CUR_VERSION); // store current version
+ out.writeFloat(score);
+ out.writeLong(lastCheck.getTime());
+ Text.writeString(out, homepageUrl);
+
+ out.writeInt(dnsFailures);
+ out.writeInt(connectionFailures);
+
+ out.writeInt(statCounts.get(CrawlDatum.STATUS_DB_UNFETCHED));
+ out.writeInt(statCounts.get(CrawlDatum.STATUS_DB_FETCHED));
+ out.writeInt(statCounts.get(CrawlDatum.STATUS_DB_NOTMODIFIED));
+ out.writeInt(statCounts.get(CrawlDatum.STATUS_DB_REDIR_PERM));
+ out.writeInt(statCounts.get(CrawlDatum.STATUS_DB_REDIR_TEMP));
+ out.writeInt(statCounts.get(CrawlDatum.STATUS_DB_GONE));
+
+ metaData.write(out);
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder();
+ buf.append("Version: " + CUR_VERSION + "\n");
+ buf.append("Homepage url: ").append(homepageUrl).append("\n");
+ buf.append("Score: ").append(score).append("\n");
+
+    if (!lastCheck.equals(DEFAULT_DATE))
+      buf.append("Last check: ").append(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(lastCheck)).append("\n");
+    else
+      buf.append("Last check: \n");
+
+    buf.append("Total records: ").append(numRecords()).append("\n");
+    buf.append("  Unfetched: ").append(statCounts.get(CrawlDatum.STATUS_DB_UNFETCHED)).append("\n");
+    buf.append("  Fetched: ").append(statCounts.get(CrawlDatum.STATUS_DB_FETCHED)).append("\n");
+    buf.append("  Gone: ").append(statCounts.get(CrawlDatum.STATUS_DB_GONE)).append("\n");
+    buf.append("  Perm redirect: ").append(statCounts.get(CrawlDatum.STATUS_DB_REDIR_PERM)).append("\n");
+    buf.append("  Temp redirect: ").append(statCounts.get(CrawlDatum.STATUS_DB_REDIR_TEMP)).append("\n");
+    buf.append("  Not modified: ").append(statCounts.get(CrawlDatum.STATUS_DB_NOTMODIFIED)).append("\n");
+
+    buf.append("Total failures: ").append(numFailures()).append("\n");
+    buf.append("  DNS failures: ").append(getDnsFailures()).append("\n");
+    buf.append("  Connection failures: ").append(getConnectionFailures()).append("\n");
+
+ return buf.toString();
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (!(o instanceof HostDatum))
+ return false;
+
+ HostDatum other = (HostDatum) o;
+    if (this.score == other.score &&
+        this.lastCheck.equals(other.lastCheck) &&
+        this.homepageUrl.compareTo(other.homepageUrl) == 0 &&
+        this.dnsFailures == other.dnsFailures &&
+        this.connectionFailures == other.connectionFailures) {
+      for (Byte key : statCounts.keySet()) {
+        if (other.getStat(key) == null || !other.getStat(key).equals(statCounts.get(key)))
+          return false;
+      }
+ return true;
+ }
+ return false;
+ }
+
+ @Override
+ public int hashCode() {
+    return dnsFailures ^
+        homepageUrl.hashCode() ^
+        lastCheck.hashCode() ^
+        connectionFailures ^
+        Float.valueOf(score).hashCode() ^
+        statCounts.get(CrawlDatum.STATUS_DB_UNFETCHED) ^
+        statCounts.get(CrawlDatum.STATUS_DB_FETCHED) ^
+        statCounts.get(CrawlDatum.STATUS_DB_NOTMODIFIED) ^
+        statCounts.get(CrawlDatum.STATUS_DB_REDIR_PERM) ^
+        statCounts.get(CrawlDatum.STATUS_DB_REDIR_TEMP) ^
+        statCounts.get(CrawlDatum.STATUS_DB_GONE);
+ }
+
+ @Override
+ public Object clone() throws CloneNotSupportedException {
+ HostDatum result = (HostDatum)super.clone();
+ result.score = score;
+ result.lastCheck = lastCheck;
+ result.homepageUrl = homepageUrl;
+
+ result.dnsFailures = dnsFailures;
+ result.connectionFailures = connectionFailures;
+
+    result.setStat(CrawlDatum.STATUS_DB_UNFETCHED, statCounts.get(CrawlDatum.STATUS_DB_UNFETCHED));
+    result.setStat(CrawlDatum.STATUS_DB_FETCHED, statCounts.get(CrawlDatum.STATUS_DB_FETCHED));
+    result.setStat(CrawlDatum.STATUS_DB_NOTMODIFIED, statCounts.get(CrawlDatum.STATUS_DB_NOTMODIFIED));
+    result.setStat(CrawlDatum.STATUS_DB_REDIR_PERM, statCounts.get(CrawlDatum.STATUS_DB_REDIR_PERM));
+    result.setStat(CrawlDatum.STATUS_DB_REDIR_TEMP, statCounts.get(CrawlDatum.STATUS_DB_REDIR_TEMP));
+    result.setStat(CrawlDatum.STATUS_DB_GONE, statCounts.get(CrawlDatum.STATUS_DB_GONE));
+
+ result.metaData = metaData;
+
+ return result;
+ }
+}
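[Editor's note: since HostDatum is a versioned Writable, an in-memory round trip is a quick sanity check that write(DataOutput) and readFields(DataInput) stay in sync. A minimal sketch; test scaffolding, not part of the patch:

    import java.io.*;
    import org.apache.nutch.crawl.CrawlDatum;
    import org.apache.nutch.util.hostdb.HostDatum;

    public class HostDatumRoundTrip {
      public static void main(String[] args) throws IOException {
        HostDatum in = new HostDatum();
        in.setHomepageUrl("http://example.org/");
        in.setStat(CrawlDatum.STATUS_DB_FETCHED, 42);

        // Serialize to a byte array ...
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));

        // ... and read it back into a fresh instance
        HostDatum out = new HostDatum();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(out.getStat(CrawlDatum.STATUS_DB_FETCHED)); // prints: 42
      }
    }
]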
Added: nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java?rev=1560316&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java Wed Jan 22 11:25:25 2014
@@ -0,0 +1,674 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util.hostdb;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.text.SimpleDateFormat;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.SynchronousQueue;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.util.FSUtils;
+import org.apache.nutch.util.LockUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.util.URLUtil;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Tool to create a HostDB from the CrawlDB. It aggregates fetch status values by host
+ * and checks DNS entries for hosts.
+ */
+public class HostDb extends Configured implements Tool {
+ public static final Logger LOG = LoggerFactory.getLogger(HostDb.class);
+  public static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+  public static final String LOCK_NAME = ".locked";
+  public static final String CURRENT_NAME = "current";
+
+  public static final String HOSTDB_PURGE_FAILED_HOSTS_THRESHOLD = "hostdb.purge.failed.hosts.threshold";
+  public static final String HOSTDB_NUM_RESOLVER_THREADS = "hostdb.num.resolvers.threads";
+  public static final String HOSTDB_RECHECK_INTERVAL = "hostdb.recheck.interval";
+ public static final String HOSTDB_CHECK_FAILED = "hostdb.check.failed";
+ public static final String HOSTDB_CHECK_NEW = "hostdb.check.new";
+ public static final String HOSTDB_CHECK_KNOWN = "hostdb.check.known";
+ public static final String HOSTDB_FORCE_CHECK = "hostdb.force.check";
+ public static final String HOSTDB_URL_FILTERING = "hostdb.url.filter";
+ public static final String HOSTDB_URL_NORMALIZING = "hostdb.url.normalize";
+
+ /**
+   * Mapper ingesting HostDB and CrawlDB entries. Additionally it can also read host
+   * score info from a plain text key/value file generated by the Webgraph's NodeDumper tool.
+   */
+  public static class HostDbMapper extends Mapper<Text, Writable, Text, NutchWritable> {
+ private Text host = new Text();
+ private HostDatum hostDatum = null;
+ private CrawlDatum crawlDatum = null;
+ private String reprUrl = null;
+ private String buffer = null;
+ private boolean filter = false;
+ private boolean normalize = false;
+ private boolean readingCrawlDb = false;
+ private URLFilters filters = null;
+ private URLNormalizers normalizers = null;
+
+ public void setup(Context context) {
+ Configuration conf = context.getConfiguration();
+ readingCrawlDb = conf.getBoolean("hostdb.reading.crawldb", false);
+ filter = conf.getBoolean(HOSTDB_URL_FILTERING, false);
+ normalize = conf.getBoolean(HOSTDB_URL_NORMALIZING, false);
+
+ if (filter)
+ filters = new URLFilters(conf);
+
+ if (normalize)
+ normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
+ }
+
+    /* Filters and/or normalizes the input URL */
+ private String filterNormalize(String u) {
+ boolean isHost = false;
+ String url = u;
+
+ if(!u.startsWith("http://") && !u.startsWith("https://")) {
+ // We received a hostname here so let's make a URL
+ url = "http://" + u + "/";
+ isHost = true;
+ }
+
+ try {
+ if (normalizers != null)
+ url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+
+ if (filters != null)
+ url = filters.filter(url);
+
+ if (isHost && url == null) {
+        // Some hosts may not allow the HTTP scheme and only allow HTTPS.
+        // So, try to force HTTPS for hosts whose URL was filtered out with the HTTP scheme.
+        // Note that this is a hacky workaround and does not work
+        // for FTP and FILE schemes.
+        String httpsUrl = "https://" + u + "/";
+        if (normalizers != null)
+          httpsUrl = normalizers.normalize(httpsUrl, URLNormalizers.SCOPE_DEFAULT);
+
+ if (filters != null)
+ httpsUrl = filters.filter(httpsUrl);
+
+ url = httpsUrl;
+ }
+ } catch (Exception e) {
+ return null;
+ }
+ return url;
+ }
+
+ /**
+     * Mapper ingesting records from the HostDB, CrawlDB and plain-text host scores
+     * file. Statistics and scores are passed on.
+     */
+    public void map(Text key, Writable value, Context context) throws IOException, InterruptedException {
+
+ if (value instanceof CrawlDatum) {
+ // This is a record from the CrawlDB
+ // Get the normalized and filtered host of this URL
+ buffer = filterNormalize(URLUtil.getHost(key.toString()));
+
+ // Filtered out?
+ if (buffer == null) {
+ context.getCounter("HostDb", "filtered_records").increment(1);
+          LOG.info(URLUtil.getHost(key.toString()) + " crawldatum has been filtered");
+ return;
+ }
+
+ // Set the host of this URL
+ host.set(buffer);
+ crawlDatum = (CrawlDatum)value;
+ hostDatum = new HostDatum();
+
+      /*
+       * Known limitation:
+       * multi redirects: host_a => host_b/page => host_c/page/whatever
+       *
+       * We cannot re-resolve redirects for host objects as CrawlDatum metadata is
+       * not available. We also cannot reliably use the reducer in all cases since
+       * redirects may be across hosts or even domains. For now we leave this for
+       * the future, as multi-redirects are not very common on the internet.
+       */
+
+      // Check if the current key equals the host's root URL
+ if (key.toString().equals("http://" + buffer + "/")) {
+ // Check if this is a redirect to the real home page
+ if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
+ crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
+
+          // Obtain the repr url for this redirect via protocol status from the metadata
+          ProtocolStatus z = (ProtocolStatus)crawlDatum.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
+
+ // Get the protocol status' arguments
+ reprUrl = z.getArgs()[0];
+
+ if (reprUrl != null) {
+ LOG.info("Homepage: " + key.toString() + " redirects to: " +
reprUrl);
+ hostDatum.setHomepageUrl(reprUrl);
+ } else {
+ LOG.info("Homepage: " + key.toString() +
+ " redirects to: " + reprUrl + " but has been filtered out");
+ }
+ } else {
+ hostDatum.setHomepageUrl("http://" + buffer + "/");
+ LOG.info("Homepage: " + "http://" + buffer + "/");
+ }
+ }
+
+ hostDatum.setStat(crawlDatum.getStatus(), 1);
+ context.write(host, new NutchWritable(hostDatum));
+ }
+      else if (value instanceof HostDatum) { // we got a record from the hostdb
+ buffer = filterNormalize(key.toString());
+
+ // Filtered out?
+ if (buffer == null) {
+ context.getCounter("HostDb", "filtered_records").increment(1);
+ LOG.info(key.toString() + " hostdatum has been filtered");
+ return;
+ }
+
+ // Get a HostDatum
+ hostDatum = (HostDatum)value;
+ key.set(buffer);
+
+ // If we're also reading CrawlDb entries, reset db_* statistics because
+ // we're aggregating them from CrawlDB anyway
+ if (readingCrawlDb)
+ hostDatum.resetStatistics();
+
+ context.write(key, new NutchWritable(hostDatum));
+ }
+      else if (value instanceof Text) { // we got a record with host scores
+ buffer = filterNormalize(key.toString());
+
+ // Filtered out?
+ if (buffer == null) {
+ context.getCounter("HostDb", "filtered_records").increment(1);
+ LOG.info(key.toString() + " score has been filtered");
+ return;
+ }
+
+ key.set(buffer);
+        context.write(key,
+          new NutchWritable(new FloatWritable(Float.parseFloat(value.toString()))));
+ }
+ }
+ }
+
+  static class HostDbReducer extends Reducer<Text, NutchWritable, Text, HostDatum> {
+ private ResolverThread resolverThread = null;
+
+ private Integer numResolverThreads = 10;
+ private static Integer purgeFailedHostsThreshold = -1;
+ private static Integer recheckInterval = 86400000;
+ private static boolean checkFailed = false;
+ private static boolean checkNew = false;
+ private static boolean checkKnown = false;
+ private static boolean force = false;
+ private static long now = new Date().getTime();
+
+ private BlockingQueue<Runnable> queue = new SynchronousQueue<Runnable>();
+ private ThreadPoolExecutor executor = null;
+
+ /**
+ * Configures the thread pool and prestarts all resolver threads.
+ */
+ public void setup(Context context) {
+ Configuration conf = context.getConfiguration();
+      purgeFailedHostsThreshold = conf.getInt(HOSTDB_PURGE_FAILED_HOSTS_THRESHOLD, -1);
+ numResolverThreads = conf.getInt(HOSTDB_NUM_RESOLVER_THREADS, 10);
+ recheckInterval = conf.getInt(HOSTDB_RECHECK_INTERVAL, 86400) * 1000;
+ checkFailed = conf.getBoolean(HOSTDB_CHECK_FAILED, false);
+ checkNew = conf.getBoolean(HOSTDB_CHECK_NEW, false);
+ checkKnown = conf.getBoolean(HOSTDB_CHECK_KNOWN, false);
+ force = conf.getBoolean(HOSTDB_FORCE_CHECK, false);
+
+ // Initialize the thread pool with our queue
+      executor = new ThreadPoolExecutor(numResolverThreads, numResolverThreads, 5, TimeUnit.SECONDS, queue);
+
+ // Run all threads in the pool
+ executor.prestartAllCoreThreads();
+ }
+
+    public void reduce(Text key, Iterable<NutchWritable> values, Context context)
+ throws IOException, InterruptedException {
+
+ HostDatum hostDatum = new HostDatum();
+ float score = 0;
+
+      // Loop through all values until we find a non-empty HostDatum, or use an
+      // empty one if this is a new host for the HostDB
+      for (NutchWritable val : values) {
+        // NutchWritable is a GenericWritable wrapper; unwrap to get the actual record
+        Writable value = val.get();
+ if (value instanceof HostDatum) {
+ HostDatum buffer = (HostDatum) value;
+
+ // Increment statistics only if this is not an existing HostDatum
+ if (hostDatum.isEmpty()) {
+ hostDatum.addStat(CrawlDatum.STATUS_DB_UNFETCHED, buffer);
+ hostDatum.addStat(CrawlDatum.STATUS_DB_FETCHED, buffer);
+ hostDatum.addStat(CrawlDatum.STATUS_DB_GONE, buffer);
+ hostDatum.addStat(CrawlDatum.STATUS_DB_REDIR_PERM, buffer);
+ hostDatum.addStat(CrawlDatum.STATUS_DB_REDIR_TEMP, buffer);
+ hostDatum.addStat(CrawlDatum.STATUS_DB_NOTMODIFIED, buffer);
+ }
+
+ // Check homepage URL
+ if (buffer.hasHomepageUrl())
+ hostDatum.setHomepageUrl(buffer.getHomepageUrl());
+
+ // Check lastCheck timestamp
+ if (!buffer.isEmpty())
+ hostDatum.setLastCheck(buffer.getLastCheck());
+
+ // Check and set failures
+ if (buffer.getDnsFailures() > 0)
+ hostDatum.setDnsFailures(buffer.getDnsFailures());
+
+ // Check and set failures
+ if (buffer.getConnectionFailures() > 0)
+ hostDatum.setConnectionFailures(buffer.getConnectionFailures());
+
+ // Check and set score (score from Web Graph has precedence)
+ if (buffer.getScore() > 0)
+ hostDatum.setScore(buffer.getScore());
+ }
+
+ // Check for the score
+ if (value instanceof FloatWritable) {
+ FloatWritable buffer = (FloatWritable)value;
+ score = buffer.get();
+ }
+ }
+
+ // Check if score was set from Web Graph
+ if (score > 0)
+ hostDatum.setScore(score);
+
+ context.getCounter("HostDb", "total_hosts").increment(1);
+
+ // See if this record is to be checked
+ if (shouldCheck(hostDatum)) {
+ // Make an entry
+        resolverThread = new ResolverThread(key.toString(), hostDatum, context);
+
+ // Add the entry to the queue (blocking)
+ try {
+ queue.put(resolverThread);
+ } catch (InterruptedException e) {
+ LOG.error("HostDb: " + StringUtils.stringifyException(e));
+ }
+
+ // Do not progress, the datum will be written in the resolver thread
+ return;
+ } else {
+ context.getCounter("HostDb", "skipped_not_eligible").increment(1);
+ LOG.info(key.toString() + ": skipped_not_eligible");
+ }
+
+ // Write the host datum if it wasn't written by the resolver thread
+ context.write(key, hostDatum);
+ }
+
+ /**
+ * Determines whether a record should be checked.
+ */
+ private boolean shouldCheck(HostDatum datum) {
+ // Whether a new record is to be checked
+ if (checkNew && datum.isEmpty()) {
+ return true;
+ }
+
+ // Whether existing known hosts should be rechecked
+ if (checkKnown && !datum.isEmpty() && datum.getDnsFailures() == 0) {
+ return isEligibleForCheck(datum);
+ }
+
+ // Whether failed records are forced to be rechecked
+ if (checkFailed && datum.getDnsFailures() > 0) {
+ return isEligibleForCheck(datum);
+ }
+
+ // It seems this record is not to be checked
+ return false;
+ }
+
+ /**
+ * Determines whether a record is eligible for recheck
+ */
+    private boolean isEligibleForCheck(HostDatum datum) {
+      // Whether an existing host, known or unknown, is forced to be rechecked;
+      // the recheck interval grows with the number of DNS failures
+      return (force || datum.getLastCheck().getTime() +
+        (recheckInterval * (datum.getDnsFailures() + 1)) < now);
+    }
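+    // Worked example for the recheck arithmetic above (hypothetical numbers):
+    // with the default recheckInterval of 86400 seconds (one day) and 2 prior
+    // DNS failures, a host last checked at time T becomes eligible again at
+    // T + 3 days, i.e. the backoff grows linearly with the failure count.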
+
+ /**
+ * Shut down all running threads and wait for completion.
+ */
+    public void cleanup(Context context) {
+ LOG.info("Feeder finished, waiting for shutdown");
+
+ // If we're here all keys have been fed and we can issue a shut down
+ executor.shutdown();
+
+ boolean finished = false;
+
+ while (!finished) {
+ try {
+ // Wait for the executor to shut down completely
+ if (!executor.isTerminated()) {
+ LOG.info("Threads waiting: " +
Integer.toString(executor.getPoolSize()));
+ Thread.sleep(1000);
+ } else {
+ // All is well, get out
+ finished = true;
+ }
+ } catch (InterruptedException e) {
+ LOG.warn(StringUtils.stringifyException(e));
+ }
+ }
+ }
+
+ static class ResolverThread implements Runnable {
+ private String host = null;
+ private HostDatum datum = null;
+ private Text hostText = new Text();
+ private Context context = null;
+
+ public ResolverThread(String host, HostDatum datum, Context context) {
+ hostText.set(host);
+ this.host = host;
+ this.datum = datum;
+ this.context = context;
+ }
+
+ public void run() {
+ // Resolve the host and act appropriately
+ datum.setLastCheck();
+ try {
+ // Throws an exception if host is not found
+ InetAddress.getByName(host);
+
+ if (datum.isEmpty()) {
+ context.getCounter("HostDb", "new_known_host").increment(1);
+ LOG.info(host + ": new_known_host " + datum);
+ } else if (datum.getDnsFailures() > 0) {
+ context.getCounter("HostDb", "rediscovered_host").increment(1);
+ datum.setDnsFailures(0);
+ LOG.info(host + ": rediscovered_host " + datum);
+ } else {
+ context.getCounter("HostDb", "existing_known_host").increment(1);
+ LOG.info(host + ": existing_known_host " + datum);
+ }
+ // Write the host datum
+ context.write(hostText, datum);
+ } catch (UnknownHostException e) {
+ try {
+          // If the counter is empty we'll initialize with date = today and 1 failure
+ if (datum.isEmpty()) {
+ datum.setDnsFailures(1);
+ context.write(hostText, datum);
+ context.getCounter("HostDb", "new_unknown_host").increment(1);
+ LOG.info(host + ": new_unknown_host " + datum);
+ } else {
+ datum.incDnsFailures();
+
+            // Check if this host should be forgotten
+            if (purgeFailedHostsThreshold == -1 ||
+                datum.getDnsFailures() < purgeFailedHostsThreshold) {
+              context.write(hostText, datum);
+              context.getCounter("HostDb", "existing_unknown_host").increment(1);
+              LOG.info(host + ": existing_unknown_host " + datum);
+            } else {
+              context.getCounter("HostDb", "purged_unknown_host").increment(1);
+              LOG.info(host + ": purged_unknown_host " + datum);
+            }
+ }
+
+ context.getCounter("HostDb",
+ Integer.toString(datum.numFailures()) +
"_times_failed").increment(1);
+ } catch (Exception ioe) {
+ LOG.warn(StringUtils.stringifyException(ioe));
+ }
+ } catch (Exception e) {
+ LOG.warn(StringUtils.stringifyException(e));
+ }
+ context.getCounter("HostDb", "checked_hosts").increment(1);
+ }
+ }
+ }
+
+  private void hostDb(Path hostDb, Path crawlDb, Path topHosts,
+      boolean checkFailed, boolean checkNew, boolean checkKnown,
+      boolean force, boolean filter, boolean normalize) throws Exception {
+
+ long start = System.currentTimeMillis();
+ LOG.info("HostDb: starting at " + sdf.format(start));
+
+ Configuration conf = getConf();
+ conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+ conf.setBoolean(HOSTDB_CHECK_FAILED, checkFailed);
+ conf.setBoolean(HOSTDB_CHECK_NEW, checkNew);
+ conf.setBoolean(HOSTDB_CHECK_KNOWN, checkKnown);
+ conf.setBoolean(HOSTDB_FORCE_CHECK, force);
+ conf.setBoolean(HOSTDB_URL_FILTERING, filter);
+ conf.setBoolean(HOSTDB_URL_NORMALIZING, normalize);
+
+ // Check whether the urlfilter-domainblacklist plugin is loaded
+ if ("urlfilter-domainblacklist".matches(conf.get("plugin.includes"))) {
+ throw new Exception("domainblacklist-urlfilter must not be enabled");
+ }
+
+ // Check whether the urlnormalizer-host plugin is loaded
+ if ("urlnormalizer-host".matches(conf.get("plugin.includes"))) {
+ throw new Exception("urlnormalizer-host must not be enabled");
+ }
+
+ FileSystem fs = FileSystem.get(conf);
+ Path old = new Path(hostDb, "old");
+ Path current = new Path(hostDb, CURRENT_NAME);
+ Path tempHostDb = new Path(hostDb, "hostdb-"
+ + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+
+ // lock an existing hostdb to prevent multiple simultaneous updates
+ Path lock = new Path(hostDb, LOCK_NAME);
+ if (!fs.exists(current)) {
+ fs.mkdirs(current);
+ }
+ LockUtil.createLockFile(fs, lock, false);
+
+ Job job = new Job(conf, "HostDb " + hostDb);
+ job.setJarByClass(HostDb.class);
+ job.setSpeculativeExecution(false);
+
+ MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
+
+ if (topHosts != null) {
+      MultipleInputs.addInputPath(job, topHosts, KeyValueTextInputFormat.class);
+ }
+ if (crawlDb != null) {
+ // Tell the job we read from CrawlDB
+ conf.setBoolean("hostdb.reading.crawldb", true);
+ MultipleInputs.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME),
+ SequenceFileInputFormat.class);
+ }
+
+ FileOutputFormat.setOutputPath(job, tempHostDb);
+
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+ job.setMapOutputKeyClass(Text.class);
+ job.setMapOutputValueClass(NutchWritable.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(HostDatum.class);
+ job.setMapperClass(HostDbMapper.class);
+ job.setReducerClass(HostDbReducer.class);
+
+ try {
+ job.waitForCompletion(true);
+
+ FSUtils.replace(fs, old, current, true);
+ FSUtils.replace(fs, current, tempHostDb, true);
+
+ boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
+ if (!preserveBackup && fs.exists(old)) fs.delete(old, true);
+ } catch (Exception e) {
+ if (fs.exists(tempHostDb)) {
+ fs.delete(tempHostDb, true);
+ }
+ LockUtil.removeLockFile(fs, lock);
+ throw e;
+ }
+
+ LockUtil.removeLockFile(fs, lock);
+ long end = System.currentTimeMillis();
+ LOG.info("HostDb: finished at " + sdf.format(end) +
+ ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ }
+
+ public static void main(String args[]) throws Exception {
+ int res = ToolRunner.run(NutchConfiguration.create(), new HostDb(), args);
+ System.exit(res);
+ }
+
+ public static void usage() {
+ System.err.println("Usage: HostDb <hostdb> " +
+ "[-crawldb <crawldb>] [-tophosts <tophosts>] [-checkAll]
[-checkFailed]" +
+ " [-checkNew] [-checkKnown] [-force] [-noFilter] [-noNormalize]");
+ System.err.println("\t<hostdb>\tdirectory name where hostdb is located");
+ System.err.println("\t-crawldb <crawldb>\tpath to a crawldb directory");
+ System.err.println("\t-tophosts <tophosts>\tkey-value text file from the
Webgraph's NodeDumper tool having score");
+ System.err.println("\t-checkAll\tApply DNS check to resolve all hosts");
+ System.err.println("\t-checkFailed\tApply DNS check to resolve only on
hosts which had failed DNS check earlier");
+ System.err.println("\t-checkNew\tApply DNS check to resolve only new
hosts");
+ System.err.println("\t-checkKnown\tApply DNS check to resolve only known
hosts");
+ System.err.println("\t-force\t\tforce hosts to be rechecked. With earlier
args, check " +
+ "is done on host only if 'recheckInterval' has elapsed.");
+ System.err.println("\t-noFilter\tturn off URLFilters on urls");
+ System.err.println("\t-noNormalize\tturn off URLNormalizer on urls");
+ }
+
+ public int run(String[] args) throws Exception {
+ if (args.length < 2) {
+ usage();
+ return -1;
+ }
+
+ Path hostDb = new Path(args[0]);
+ Path crawlDb = null;
+ Path topHosts = null;
+
+ boolean checkFailed = false;
+ boolean checkNew = false;
+ boolean checkKnown = false;
+ boolean force = false;
+
+ boolean filter = true;
+ boolean normalize = true;
+
+ for (int i = 1; i < args.length; i++) {
+ if (args[i].equals("-crawldb")) {
+ crawlDb = new Path(args[++i]);
+ LOG.info("HostDb: crawldb: " + crawlDb);
+ }
+ else if (args[i].equals("-tophosts")) {
+ topHosts = new Path(args[++i]);
+ LOG.info("HostDb: tophosts: " + topHosts);
+ }
+ else if (args[i].equals("-checkFailed")) {
+ LOG.info("HostDb: checking failed hosts");
+ checkFailed = true;
+ }
+ else if (args[i].equals("-checkNew")) {
+ LOG.info("HostDb: checking new hosts");
+ checkNew = true;
+ }
+ else if (args[i].equals("-checkKnown")) {
+ LOG.info("HostDb: checking known hosts");
+ checkKnown = true;
+ }
+ else if (args[i].equals("-checkAll")) {
+ LOG.info("HostDb: checking all hosts");
+ checkFailed = true;
+ checkNew = true;
+ checkKnown = true;
+ }
+ else if (args[i].equals("-force")) {
+ LOG.info("HostDb: forced check");
+ force = true;
+ }
+ else if (args[i].equals("-noFilter")) {
+ LOG.info("HostDb: filtering disabled");
+ filter = false;
+ }
+ else if (args[i].equals("-noNormalize")) {
+ LOG.info("HostDb: normalizing disabled");
+ normalize = false;
+ }
+ else {
+ LOG.info("HostDb: Found invalid argument \"" + args[i] + "\"\n");
+ usage();
+ return -1;
+ }
+ }
+
+ try {
+      hostDb(hostDb, crawlDb, topHosts, checkFailed, checkNew, checkKnown, force, filter, normalize);
+ return 0;
+ } catch (Exception e) {
+ LOG.error("HostDb: " + StringUtils.stringifyException(e));
+ return -1;
+ }
+ }
+}
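[Editor's note: taken together, a typical update cycle for the new tool might look like the following sketch; all paths are hypothetical:

    # aggregate CrawlDB statistics per host and DNS-check newly seen hosts
    bin/nutch hostdb crawl/hostdb -crawldb crawl/crawldb -checkNew

    # later, fold in WebGraph scores from a NodeDumper key/value text dump
    bin/nutch hostdb crawl/hostdb -tophosts crawl/tophosts.txt
]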