Author: kubes Date: Tue Dec 2 06:50:37 2008 New Revision: 722478 URL: http://svn.apache.org/viewvc?rev=722478&view=rev Log: NUTCH-647: Resolve URLs tool
Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java?rev=722478&view=auto ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java Tue Dec 2 06:50:37 2008 @@ -0,0 +1,184 @@ +package org.apache.nutch.tools; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.net.InetAddress; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.util.URLUtil; + +/** + * A simple tool that will spin up multiple threads to resolve urls to ip + * addresses. This can be used to verify that pages that are failing due to + * UnknownHostException during fetching are actually bad and are not failing due + * to a dns problem in fetching. + */ +public class ResolveUrls { + + public static final Log LOG = LogFactory.getLog(ResolveUrls.class); + + private String urlsFile = null; + private int numThreads = 100; + private ExecutorService pool = null; + private static AtomicInteger numTotal = new AtomicInteger(0); + private static AtomicInteger numErrored = new AtomicInteger(0); + private static AtomicInteger numResolved = new AtomicInteger(0); + private static AtomicLong totalTime = new AtomicLong(0L); + + /** + * A Thread which gets the ip address of a single host by name. + */ + private static class ResolverThread + extends Thread { + + private String url = null; + + public ResolverThread(String url) { + this.url = url; + } + + public void run() { + + numTotal.incrementAndGet(); + String host = URLUtil.getHost(url); + long start = System.currentTimeMillis(); + try { + + // get the address by name and if no error is thrown then it + // is resolved successfully + InetAddress ia = InetAddress.getByName(host); + LOG.info("Resolved: " + host); + numResolved.incrementAndGet(); + } + catch (Exception uhe) { + LOG.info("Error Resolving: " + host); + numErrored.incrementAndGet(); + } + long end = System.currentTimeMillis(); + long total = (end - start); + totalTime.addAndGet(total); + LOG.info(", " + total + " millis"); + } + } + + /** + * Creates a thread pool for resolving urls. Reads in the url file on the + * local filesystem. For each url it attempts to resolve it keeping a total + * account of the number resolved, errored, and the amount of time. + */ + public void resolveUrls() { + + try { + + // create a thread pool with a fixed number of threads + pool = Executors.newFixedThreadPool(numThreads); + + // read in the urls file and loop through each line, one url per line + BufferedReader buffRead = new BufferedReader(new FileReader(new File( + urlsFile))); + String urlStr = null; + while ((urlStr = buffRead.readLine()) != null) { + + // spin up a resolver thread per url + LOG.info("Starting: " + urlStr); + pool.execute(new ResolverThread(urlStr)); + } + + // close the file and wait for up to 60 seconds before shutting down + // the thread pool to give urls time to finish resolving + buffRead.close(); + pool.awaitTermination(60, TimeUnit.SECONDS); + } + catch (Exception e) { + + // on error shutdown the thread pool immediately + pool.shutdownNow(); + LOG.info(StringUtils.stringifyException(e)); + } + + // shutdown the thread pool and log totals + pool.shutdown(); + LOG.info("Total: " + numTotal.get() + ", Resovled: " + + numResolved.get() + ", Errored: " + numErrored.get() + + ", Average Time: " + totalTime.get() / numTotal.get()); + } + + /** + * Create a new ResolveUrls with a file from the local file system. + * + * @param urlsFile The local urls file, one url per line. + */ + public ResolveUrls(String urlsFile) { + this(urlsFile, 100); + } + + /** + * Create a new ResolveUrls with a urls file and a number of threads for the + * Thread pool. Number of threads is 100 by default. + * + * @param urlsFile The local urls file, one url per line. + * @param numThreads The number of threads used to resolve urls in parallel. + */ + public ResolveUrls(String urlsFile, int numThreads) { + this.urlsFile = urlsFile; + this.numThreads = numThreads; + } + + /** + * Runs the resolve urls tool. + */ + public static void main(String[] args) { + + Options options = new Options(); + Option helpOpts = OptionBuilder.withArgName("help").withDescription( + "show this help message").create("help"); + Option urlOpts = OptionBuilder.withArgName("urls").hasArg().withDescription( + "the urls file to check").create("urls"); + Option numThreadOpts = OptionBuilder.withArgName("numThreads").hasArgs().withDescription( + "the number of threads to use").create("numThreads"); + options.addOption(helpOpts); + options.addOption(urlOpts); + options.addOption(numThreadOpts); + + CommandLineParser parser = new GnuParser(); + try { + + // parse out common line arguments + CommandLine line = parser.parse(options, args); + if (line.hasOption("help") || !line.hasOption("urls")) { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp("ResolveUrls", options); + return; + } + + // get the urls and the number of threads and start the resolver + String urls = line.getOptionValue("urls"); + int numThreads = 100; + String numThreadsStr = line.getOptionValue("numThreads"); + if (numThreadsStr != null) { + numThreads = Integer.parseInt(numThreadsStr); + } + ResolveUrls resolve = new ResolveUrls(urls, numThreads); + resolve.resolveUrls(); + } + catch (Exception e) { + LOG.fatal("ResolveUrls: " + StringUtils.stringifyException(e)); + } + } + +}