[nutch] branch master updated: NUTCH-2429 Fix Plugin System to allow protocol plugins to bundle their URLStreamHandlers (#720)

lewismc Fri, 07 Jan 2022 20:09:36 -0800

This is an automated email from the ASF dual-hosted git repository.

lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git



The following commit(s) were added to refs/heads/master by this push:
     new e76d69f  NUTCH-2429 Fix Plugin System to allow protocol plugins to 
bundle their URLStreamHandlers (#720)
e76d69f is described below

commit e76d69fe13902fd2f3a98660dd2bac52c2ea568c
Author: Lewis John McGibbney <[email protected]>
AuthorDate: Fri Jan 7 20:07:54 2022 -0800

    NUTCH-2429 Fix Plugin System to allow protocol plugins to bundle their 
URLStreamHandlers (#720)
    
    * NUTCH-2429 Fix Plugin System to allow protocol plugins to bundle their 
URLStreamHandlers
    
    Co-authored-by: Hiran Chaudhuri <[email protected]>
---
 build.xml                                          |   1 +
 src/java/org/apache/nutch/crawl/CrawlDbReader.java |  43 ++--
 src/java/org/apache/nutch/parse/ParserChecker.java |   5 +
 .../apache/nutch/plugin/PluginManifestParser.java  |  66 +++---
 .../org/apache/nutch/plugin/PluginRepository.java  | 244 +++++++++++++++------
 .../nutch/plugin/URLStreamHandlerFactory.java      | 115 ++++++++++
 .../apache/nutch/util/CrawlCompletionStats.java    |  40 ++--
 src/java/org/apache/nutch/util/NutchJob.java       |  12 +-
 src/java/org/apache/nutch/util/NutchTool.java      |   9 +
 .../org/apache/nutch/util/SitemapProcessor.java    |  10 +-
 .../apache/nutch/util/domain/DomainStatistics.java |  20 +-
 .../apache/nutch/any23/Any23IndexingFilter.java    |   2 +-
 .../org/apache/nutch/any23/Any23ParseFilter.java   |   2 +-
 src/plugin/build.xml                               |   2 +
 .../nutch/indexwriter/csv/CSVIndexWriter.java      |   2 +-
 .../indexwriter/rabbit/RabbitIndexWriter.java      |   2 +-
 src/plugin/protocol-foo/build.xml                  |  22 ++
 src/plugin/protocol-foo/ivy.xml                    |  41 ++++
 src/plugin/protocol-foo/plugin.xml                 |  48 ++++
 .../java/org/apache/nutch/protocol/foo/Foo.java    | 141 ++++++++++++
 .../org/apache/nutch/protocol/foo/Handler.java     |  28 +++
 21 files changed, 696 insertions(+), 159 deletions(-)

diff --git a/build.xml b/build.xml
index ecef1e7..2c0eef0 100644
--- a/build.xml
+++ b/build.xml
@@ -1272,6 +1272,7 @@
         <source path="${plugins.dir}/parsefilter-regex/src/test/" />
         <source path="${plugins.dir}/protocol-file/src/java/" />
         <source path="${plugins.dir}/protocol-file/src/test/" />
+        <source path="${plugins.dir}/protocol-foo/src/java/" />
         <source path="${plugins.dir}/protocol-ftp/src/java/" />
         <source path="${plugins.dir}/protocol-htmlunit/src/java/" />
         <source path="${plugins.dir}/protocol-http/src/java/" />
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java 
b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 2a20a56..f31210a 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -16,11 +16,12 @@
  */
 package org.apache.nutch.crawl;
 
+import java.io.Closeable;
 import java.io.DataOutputStream;
 import java.io.File;
 import java.io.IOException;
-import java.io.Closeable;
 import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
@@ -32,16 +33,11 @@ import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Random;
+import java.util.TreeMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-import java.util.TreeMap;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.tdunning.math.stats.MergingDigest;
-import com.tdunning.math.stats.TDigest;
 
+import org.apache.commons.jexl3.JexlScript;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
@@ -55,18 +51,18 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.RecordWriter;
 import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.util.ToolRunner;
 import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.util.AbstractChecker;
 import org.apache.nutch.util.JexlUtil;
 import org.apache.nutch.util.NutchConfiguration;
@@ -74,7 +70,8 @@ import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.SegmentReaderUtil;
 import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.TimingUtil;
-import org.apache.commons.jexl3.JexlScript;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import com.fasterxml.jackson.core.JsonGenerationException;
 import com.fasterxml.jackson.core.JsonGenerator;
@@ -84,6 +81,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.ObjectWriter;
 import com.fasterxml.jackson.databind.SerializerProvider;
 import com.fasterxml.jackson.databind.module.SimpleModule;
+import com.tdunning.math.stats.MergingDigest;
+import com.tdunning.math.stats.TDigest;
 
 /**
  * Read utility for the CrawlDB.
@@ -375,10 +374,14 @@ public class CrawlDbReader extends AbstractChecker 
implements Closeable {
       context.write(new Text("fit"), fetchInterval);
 
       if (sort) {
-        URL u = new URL(key.toString());
-        String host = u.getHost();
-        context.write(new Text("status " + value.getStatus() + " " + host),
-            COUNT_1);
+        try {
+          URL u = new URL(key.toString());
+          String host = u.getHost();
+          context.write(new Text("status " + value.getStatus() + " " + host),
+              COUNT_1);
+        } catch (MalformedURLException e) {
+          LOG.error("Failed to get host from URL {}: {}", key.toString(), 
e.getMessage());
+        }
       }
     }
   }
diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java 
b/src/java/org/apache/nutch/parse/ParserChecker.java
index 7b0e76a..6c82a51 100644
--- a/src/java/org/apache/nutch/parse/ParserChecker.java
+++ b/src/java/org/apache/nutch/parse/ParserChecker.java
@@ -28,6 +28,7 @@ import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.plugin.PluginRepository;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.scoring.ScoringFilters;
@@ -106,6 +107,10 @@ public class ParserChecker extends AbstractChecker {
       System.exit(-1);
     }
 
+    // initialize plugins early to register URL stream handlers to support
+    // custom protocol implementations
+    PluginRepository.get(getConf());
+
     int numConsumed;
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-normalize")) {
diff --git a/src/java/org/apache/nutch/plugin/PluginManifestParser.java 
b/src/java/org/apache/nutch/plugin/PluginManifestParser.java
index d7280ad..4c845b4 100644
--- a/src/java/org/apache/nutch/plugin/PluginManifestParser.java
+++ b/src/java/org/apache/nutch/plugin/PluginManifestParser.java
@@ -29,9 +29,9 @@ import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
 
-import org.slf4j.Logger;
-
 import org.apache.hadoop.conf.Configuration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
@@ -39,8 +39,9 @@ import org.w3c.dom.NodeList;
 import org.xml.sax.SAXException;
 
 /**
- * The <code>PluginManifestParser</code> parser just parse the manifest file in
- * all plugin directories.
+ * The <code>PluginManifestParser</code> provides a mechanism for
+ * parsing Nutch plugin manifest files (<code>plugin.xml</code>) contained
+ * in a {@code String[]} of plugin directories.
  * 
  * @author joa23
  */
@@ -49,17 +50,15 @@ public class PluginManifestParser {
   private static final String ATTR_CLASS = "class";
   private static final String ATTR_ID = "id";
 
-  public static final Logger LOG = PluginRepository.LOG;
+  protected static final Logger LOG = 
LoggerFactory.getLogger(PluginManifestParser.class);
 
-  private static final boolean WINDOWS = System.getProperty("os.name")
-      .startsWith("Windows");
+  private static final boolean WINDOWS = 
System.getProperty("os.name").startsWith("Windows");
 
   private Configuration conf;
 
   private PluginRepository pluginRepository;
 
-  public PluginManifestParser(Configuration conf,
-      PluginRepository pluginRepository) {
+  public PluginManifestParser(Configuration conf, PluginRepository 
pluginRepository) {
     this.conf = conf;
     this.pluginRepository = pluginRepository;
   }
@@ -83,18 +82,17 @@ public class PluginManifestParser {
       if (directory == null) {
         continue;
       }
-      LOG.info("Plugins: looking in: " + directory.getAbsolutePath());
+      LOG.info("Plugins: looking in: {}", directory.getAbsolutePath());
       for (File oneSubFolder : directory.listFiles()) {
         if (oneSubFolder.isDirectory()) {
           String manifestPath = oneSubFolder.getAbsolutePath() + File.separator
-              + "plugin.xml";
+                  + "plugin.xml";
           try {
-            LOG.debug("parsing: " + manifestPath);
+            LOG.debug("Parsing: {}", manifestPath);
             PluginDescriptor p = parseManifestFile(manifestPath);
             map.put(p.getPluginId(), p);
           } catch (Exception e) {
-            LOG.warn("Error while loading plugin `" + manifestPath + "` "
-                + e.toString());
+            LOG.warn("Error while loading plugin {}: {}", manifestPath, 
e.toString());
           }
         }
       }
@@ -113,13 +111,13 @@ public class PluginManifestParser {
     if (!directory.isAbsolute()) {
       URL url = PluginManifestParser.class.getClassLoader().getResource(name);
       if (url == null && directory.exists() && directory.isDirectory()
-          && directory.listFiles().length > 0) {
+              && directory.listFiles().length > 0) {
         return directory; // relative path that is not in the classpath
       } else if (url == null) {
-        LOG.warn("Plugins: directory not found: " + name);
+        LOG.warn("Plugins: directory not found: {}", name);
         return null;
       } else if (!"file".equals(url.getProtocol())) {
-        LOG.warn("Plugins: not a file: url. Can't load plugins from: " + url);
+        LOG.warn("Plugins: not a file: url. Can't load plugins from: {}", url);
         return null;
       }
       String path = url.getPath();
@@ -131,7 +129,7 @@ public class PluginManifestParser {
       }
       directory = new File(path);
     } else if (!directory.exists()) {
-      LOG.warn("Plugins: directory not found: " + name);
+      LOG.warn("Plugins: directory not found: {}", name);
       return null;
     }
     return directory;
@@ -145,8 +143,8 @@ public class PluginManifestParser {
    * @throws MalformedURLException
    */
   private PluginDescriptor parseManifestFile(String pManifestPath)
-      throws MalformedURLException, SAXException, IOException,
-      ParserConfigurationException {
+          throws MalformedURLException, SAXException, IOException,
+          ParserConfigurationException {
     Document document = parseXML(new File(pManifestPath).toURI().toURL());
     String pPath = new File(pManifestPath).getParent();
     return parsePlugin(document, pPath);
@@ -160,8 +158,8 @@ public class PluginManifestParser {
    * @throws ParserConfigurationException
    * @throws DocumentException
    */
-  private Document parseXML(URL url) throws SAXException, IOException,
-      ParserConfigurationException {
+  private Document parseXML(URL url)
+          throws SAXException, IOException, ParserConfigurationException {
     DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
     DocumentBuilder builder = factory.newDocumentBuilder();
     return builder.parse(url.openStream());
@@ -172,7 +170,7 @@ public class PluginManifestParser {
    * @throws MalformedURLException
    */
   private PluginDescriptor parsePlugin(Document pDocument, String pPath)
-      throws MalformedURLException {
+          throws MalformedURLException {
     Element rootElement = pDocument.getDocumentElement();
     String id = rootElement.getAttribute(ATTR_ID);
     String name = rootElement.getAttribute(ATTR_NAME);
@@ -183,9 +181,9 @@ public class PluginManifestParser {
       pluginClazz = rootElement.getAttribute(ATTR_CLASS);
     }
     PluginDescriptor pluginDescriptor = new PluginDescriptor(id, version, name,
-        providerName, pluginClazz, pPath, this.conf);
-    LOG.debug("plugin: id=" + id + " name=" + name + " version=" + version
-        + " provider=" + providerName + "class=" + pluginClazz);
+            providerName, pluginClazz, pPath, this.conf);
+    LOG.debug("plugin: id={} name={} version={} provider={} class={}", 
+            id, name, version, providerName, pluginClazz);
     parseExtension(rootElement, pluginDescriptor);
     parseExtensionPoints(rootElement, pluginDescriptor);
     parseLibraries(rootElement, pluginDescriptor);
@@ -199,7 +197,7 @@ public class PluginManifestParser {
    * @throws MalformedURLException
    */
   private void parseRequires(Element pRootElement, PluginDescriptor 
pDescriptor)
-      throws MalformedURLException {
+          throws MalformedURLException {
 
     NodeList nodelist = pRootElement.getElementsByTagName("requires");
     if (nodelist.getLength() > 0) {
@@ -222,8 +220,8 @@ public class PluginManifestParser {
    * @param pDescriptor
    * @throws MalformedURLException
    */
-  private void parseLibraries(Element pRootElement, PluginDescriptor 
pDescriptor)
-      throws MalformedURLException {
+  private void parseLibraries(Element pRootElement,
+          PluginDescriptor pDescriptor) throws MalformedURLException {
     NodeList nodelist = pRootElement.getElementsByTagName("runtime");
     if (nodelist.getLength() > 0) {
 
@@ -248,7 +246,7 @@ public class PluginManifestParser {
    * @param pluginDescriptor
    */
   private void parseExtensionPoints(Element pRootElement,
-      PluginDescriptor pPluginDescriptor) {
+          PluginDescriptor pPluginDescriptor) {
     NodeList list = pRootElement.getElementsByTagName("extension-point");
     if (list != null) {
       for (int i = 0; i < list.getLength(); i++) {
@@ -267,7 +265,7 @@ public class PluginManifestParser {
    * @param pluginDescriptor
    */
   private void parseExtension(Element pRootElement,
-      PluginDescriptor pPluginDescriptor) {
+          PluginDescriptor pPluginDescriptor) {
     NodeList extensions = pRootElement.getElementsByTagName("extension");
     if (extensions != null) {
       for (int i = 0; i < extensions.getLength(); i++) {
@@ -286,14 +284,14 @@ public class PluginManifestParser {
             String extensionClass = oneImplementation.getAttribute(ATTR_CLASS);
             LOG.debug("impl: point=" + pointId + " class=" + extensionClass);
             Extension extension = new Extension(pPluginDescriptor, pointId, id,
-                extensionClass, this.conf, this.pluginRepository);
+                    extensionClass, this.conf, this.pluginRepository);
             NodeList parameters = oneImplementation
-                .getElementsByTagName("parameter");
+                    .getElementsByTagName("parameter");
             if (parameters != null) {
               for (int k = 0; k < parameters.getLength(); k++) {
                 Element param = (Element) parameters.item(k);
                 extension.addAttribute(param.getAttribute(ATTR_NAME),
-                    param.getAttribute("value"));
+                        param.getAttribute("value"));
               }
             }
             pPluginDescriptor.addExtension(extension);
diff --git a/src/java/org/apache/nutch/plugin/PluginRepository.java 
b/src/java/org/apache/nutch/plugin/PluginRepository.java
index 44df3a2..726da45 100644
--- a/src/java/org/apache/nutch/plugin/PluginRepository.java
+++ b/src/java/org/apache/nutch/plugin/PluginRepository.java
@@ -21,30 +21,39 @@ import java.lang.reflect.Array;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
+import java.net.URLStreamHandler;
+import java.net.URLStreamHandlerFactory;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
-import java.util.WeakHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.WeakHashMap;
 import java.util.regex.Pattern;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.ObjectCache;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
- * The plugin repositority is a registry of all plugins.
+ * <p>The plugin repository is a registry of all plugins.</p>
  * 
- * At system boot up a repositority is built by parsing the mainifest files of
+ * <p>At system boot up a repository is built by parsing the manifest files of
  * all plugins. Plugins that require other plugins which do not exist are not
  * registed. For each plugin a plugin descriptor instance will be created. The
  * descriptor represents all meta information about a plugin. So a plugin
  * instance will be created later when it is required, this allow lazy plugin
- * loading.
+ * loading.</p>
+ *
+ * <p>As protocol-plugins need to be registered with the JVM as well, this 
class
+ * also acts as an {@link java.net.URLStreamHandlerFactory} that registers with
+ * the JVM and supports all the new protocols as if they were native. Details 
of
+ * how the JVM creates URLs can be seen in the API documentation for the
+ * <a 
href="https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/net/URL.html#%3Cinit%3E(java.lang.String,java.lang.String,int,java.lang.String)">URL
 constructor</a>.</p>
  */
-public class PluginRepository {
+public class PluginRepository implements URLStreamHandlerFactory {
   private static final WeakHashMap<String, PluginRepository> CACHE = new 
WeakHashMap<>();
 
   private boolean auto;
@@ -59,8 +68,7 @@ public class PluginRepository {
 
   private Configuration conf;
 
-  protected static final Logger LOG = LoggerFactory
-      .getLogger(MethodHandles.lookup().lookupClass());
+  protected static final Logger LOG = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   /**
    * @param conf a populated {@link Configuration}
@@ -73,26 +81,29 @@ public class PluginRepository {
     this.auto = conf.getBoolean("plugin.auto-activation", true);
     String[] pluginFolders = conf.getStrings("plugin.folders");
     PluginManifestParser manifestParser = new PluginManifestParser(this.conf,
-        this);
+            this);
     Map<String, PluginDescriptor> allPlugins = manifestParser
-        .parsePluginFolder(pluginFolders);
+            .parsePluginFolder(pluginFolders);
     if (allPlugins.isEmpty()) {
       LOG.warn("No plugins found on paths of property plugin.folders=\"{}\"",
-          conf.get("plugin.folders"));
+              conf.get("plugin.folders"));
     }
     Pattern excludes = Pattern.compile(conf.get("plugin.excludes", ""));
     Pattern includes = Pattern.compile(conf.get("plugin.includes", ""));
     Map<String, PluginDescriptor> filteredPlugins = filter(excludes, includes,
-        allPlugins);
+            allPlugins);
     fRegisteredPlugins = getDependencyCheckedPlugins(filteredPlugins,
-        this.auto ? allPlugins : filteredPlugins);
+            this.auto ? allPlugins : filteredPlugins);
     installExtensionPoints(fRegisteredPlugins);
     try {
       installExtensions(fRegisteredPlugins);
     } catch (PluginRuntimeException e) {
-      LOG.error(e.toString());
+      LOG.error("Could not install extensions: {}", e.toString());
       throw new RuntimeException(e.getMessage());
     }
+
+    registerURLStreamHandlerFactory();
+
     displayStatus();
   }
 
@@ -122,7 +133,7 @@ public class PluginRepository {
     for (PluginDescriptor plugin : plugins) {
       for (ExtensionPoint point : plugin.getExtenstionPoints()) {
         String xpId = point.getId();
-        LOG.debug("Adding extension point " + xpId);
+        LOG.debug("Adding extension point {}", xpId);
         fExtensionPoints.put(xpId, point);
       }
     }
@@ -132,16 +143,15 @@ public class PluginRepository {
    * @param pRegisteredPlugins
    */
   private void installExtensions(List<PluginDescriptor> pRegisteredPlugins)
-      throws PluginRuntimeException {
+          throws PluginRuntimeException {
 
     for (PluginDescriptor descriptor : pRegisteredPlugins) {
       for (Extension extension : descriptor.getExtensions()) {
         String xpId = extension.getTargetPoint();
         ExtensionPoint point = getExtensionPoint(xpId);
         if (point == null) {
-          throw new PluginRuntimeException("Plugin ("
-              + descriptor.getPluginId() + "), " + "extension point: " + xpId
-              + " does not exist.");
+          throw new PluginRuntimeException("Plugin (" + 
descriptor.getPluginId()
+          + "), " + "extension point: " + xpId + " does not exist.");
         }
         point.addExtension(extension);
       }
@@ -149,10 +159,10 @@ public class PluginRepository {
   }
 
   private void getPluginCheckedDependencies(PluginDescriptor plugin,
-      Map<String, PluginDescriptor> plugins,
-      Map<String, PluginDescriptor> dependencies,
-      Map<String, PluginDescriptor> branch) throws MissingDependencyException,
-      CircularDependencyException {
+          Map<String, PluginDescriptor> plugins,
+          Map<String, PluginDescriptor> dependencies,
+          Map<String, PluginDescriptor> branch)
+                  throws MissingDependencyException, 
CircularDependencyException {
 
     if (dependencies == null) {
       dependencies = new HashMap<>();
@@ -166,24 +176,24 @@ public class PluginRepository {
     for (String id : plugin.getDependencies()) {
       PluginDescriptor dependency = plugins.get(id);
       if (dependency == null) {
-        throw new MissingDependencyException("Missing dependency " + id
-            + " for plugin " + plugin.getPluginId());
+        throw new MissingDependencyException(
+                "Missing dependency " + id + " for plugin " + 
plugin.getPluginId());
       }
       if (branch.containsKey(id)) {
         throw new CircularDependencyException("Circular dependency detected "
-            + id + " for plugin " + plugin.getPluginId());
+                + id + " for plugin " + plugin.getPluginId());
       }
       dependencies.put(id, dependency);
       getPluginCheckedDependencies(plugins.get(id), plugins, dependencies,
-          branch);
+              branch);
     }
 
     branch.remove(plugin.getPluginId());
   }
 
   private Map<String, PluginDescriptor> getPluginCheckedDependencies(
-      PluginDescriptor plugin, Map<String, PluginDescriptor> plugins)
-      throws MissingDependencyException, CircularDependencyException {
+          PluginDescriptor plugin, Map<String, PluginDescriptor> plugins)
+                  throws MissingDependencyException, 
CircularDependencyException {
     Map<String, PluginDescriptor> dependencies = new HashMap<>();
     Map<String, PluginDescriptor> branch = new HashMap<>();
     getPluginCheckedDependencies(plugin, plugins, dependencies, branch);
@@ -198,7 +208,8 @@ public class PluginRepository {
    * @return List
    */
   private List<PluginDescriptor> getDependencyCheckedPlugins(
-      Map<String, PluginDescriptor> filtered, Map<String, PluginDescriptor> 
all) {
+          Map<String, PluginDescriptor> filtered,
+          Map<String, PluginDescriptor> all) {
     if (filtered == null) {
       return null;
     }
@@ -209,7 +220,7 @@ public class PluginRepository {
         checked.putAll(getPluginCheckedDependencies(plugin, all));
         checked.put(plugin.getPluginId(), plugin);
       } catch (MissingDependencyException mde) {
-        // Logger exception and ignore plugin
+        // Log exception and ignore plugin
         LOG.warn(mde.getMessage());
       } catch (CircularDependencyException cde) {
         // Simply ignore this plugin
@@ -225,8 +236,8 @@ public class PluginRepository {
    * @return PluginDescriptor[]
    */
   public PluginDescriptor[] getPluginDescriptors() {
-    return fRegisteredPlugins.toArray(new PluginDescriptor[fRegisteredPlugins
-        .size()]);
+    return fRegisteredPlugins
+            .toArray(new PluginDescriptor[fRegisteredPlugins.size()]);
   }
 
   /**
@@ -255,14 +266,14 @@ public class PluginRepository {
   }
 
   /**
-   * Returns a instance of a plugin. Plugin instances are cached. So a plugin
+   * <p>Returns an instance of a plugin. Plugin instances are cached. So a plugin
    * exist only as one instance. This allow a central management of plugin own
-   * resources.
+   * resources.</p>
    * 
-   * After creating the plugin instance the startUp() method is invoked. The
+   * <p>After creating the plugin instance the startUp() method is invoked. The
    * plugin use a own classloader that is used as well by all instance of
    * extensions of the same plugin. This class loader use all exported 
libraries
-   * from the dependend plugins and all plugin libraries.
+   * from the dependent plugins and all plugin libraries.</p>
    * 
    * @param pDescriptor a {@link PluginDescriptor} for which to retrieve a 
    * {@link Plugin} instance
@@ -270,7 +281,7 @@ public class PluginRepository {
    * @throws PluginRuntimeException if there is a fatal runtime plugin error
    */
   public Plugin getPluginInstance(PluginDescriptor pDescriptor)
-      throws PluginRuntimeException {
+          throws PluginRuntimeException {
     if (fActivatedPlugins.containsKey(pDescriptor.getPluginId()))
       return fActivatedPlugins.get(pDescriptor.getPluginId());
     try {
@@ -280,11 +291,11 @@ public class PluginRepository {
       // Suggested by Stefan Groschupf <[email protected]>
       synchronized (pDescriptor) {
         Class<?> pluginClass = getCachedClass(pDescriptor,
-            pDescriptor.getPluginClass());
-        Constructor<?> constructor = pluginClass.getConstructor(new Class<?>[] 
{
-            PluginDescriptor.class, Configuration.class });
-        Plugin plugin = (Plugin) constructor.newInstance(new Object[] {
-            pDescriptor, this.conf });
+                pDescriptor.getPluginClass());
+        Constructor<?> constructor = pluginClass.getConstructor(
+                new Class<?>[] { PluginDescriptor.class, Configuration.class 
});
+        Plugin plugin = (Plugin) constructor
+                .newInstance(new Object[] { pDescriptor, this.conf });
         plugin.startUp();
         fActivatedPlugins.put(pDescriptor.getPluginId(), plugin);
         return plugin;
@@ -302,11 +313,13 @@ public class PluginRepository {
     }
   }
 
-  /*
-   * (non-Javadoc)
-   * 
+  /**
+   * Attempts to shut down all activated plugins.
+   * @deprecated Finalization is deprecated for removal (see JEP 421).
+   * @see <a href="https://openjdk.java.net/jeps/421">JEP 421: Deprecate Finalization for Removal</a>
    * @see java.lang.Object#finalize()
    */
+  @Deprecated
   public void finalize() throws Throwable {
     shutDownActivatedPlugins();
   }
@@ -323,7 +336,7 @@ public class PluginRepository {
   }
 
   public Class getCachedClass(PluginDescriptor pDescriptor, String className)
-      throws ClassNotFoundException {
+          throws ClassNotFoundException {
     Map<PluginClassLoader, Class> descMap = CLASS_CACHE.get(className);
     if (descMap == null) {
       descMap = new HashMap<>();
@@ -339,14 +352,14 @@ public class PluginRepository {
   }
 
   private void displayStatus() {
-    LOG.info("Plugin Auto-activation mode: [" + this.auto + "]");
+    LOG.info("Plugin Auto-activation mode: [{}]", this.auto);
     LOG.info("Registered Plugins:");
 
     if ((fRegisteredPlugins == null) || (fRegisteredPlugins.size() == 0)) {
       LOG.info("\tNONE");
     } else {
       for (PluginDescriptor plugin : fRegisteredPlugins) {
-        LOG.info("\t" + plugin.getName() + " (" + plugin.getPluginId() + ")");
+        LOG.info("\t{} ({})", plugin.getName(), plugin.getPluginId());
       }
     }
 
@@ -355,7 +368,7 @@ public class PluginRepository {
       LOG.info("\tNONE");
     } else {
       for (ExtensionPoint ep : fExtensionPoints.values()) {
-        LOG.info("\t" + ep.getName() + " (" + ep.getId() + ")");
+        LOG.info("\t{} ({})", ep.getName(), ep.getId());
       }
     }
   }
@@ -372,7 +385,7 @@ public class PluginRepository {
    * @return map of plugins matching the configuration
    */
   private Map<String, PluginDescriptor> filter(Pattern excludes,
-      Pattern includes, Map<String, PluginDescriptor> plugins) {
+          Pattern includes, Map<String, PluginDescriptor> plugins) {
 
     Map<String, PluginDescriptor> map = new HashMap<>();
 
@@ -391,11 +404,11 @@ public class PluginRepository {
       }
 
       if (!includes.matcher(id).matches()) {
-        LOG.debug("not including: " + id);
+        LOG.debug("not including: {}", id);
         continue;
       }
       if (excludes.matcher(id).matches()) {
-        LOG.debug("excluding: " + id);
+        LOG.debug("excluding: {}", id);
         continue;
       }
       map.put(plugin.getPluginId(), plugin);
@@ -419,7 +432,7 @@ public class PluginRepository {
    * @return array of plugin instances
    */
   public synchronized Object[] getOrderedPlugins(Class<?> clazz,
-      String xPointId, String orderProperty) {
+          String xPointId, String orderProperty) {
     Object[] filters;
     ObjectCache objectCache = ObjectCache.get(conf);
     filters = (Object[]) objectCache.getObject(clazz.getName());
@@ -434,8 +447,8 @@ public class PluginRepository {
       }
 
       try {
-        ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
-            xPointId);
+        ExtensionPoint point = PluginRepository.get(conf)
+                .getExtensionPoint(xPointId);
         if (point == null)
           throw new RuntimeException(xPointId + " not found.");
         Extension[] extensions = point.getExtensions();
@@ -453,9 +466,9 @@ public class PluginRepository {
         for (String orderedFilter : orderOfFilters) {
           Object f = filterMap.get(orderedFilter);
           if (f == null) {
-            LOG.error(clazz.getSimpleName() + " : " + orderedFilter
-                + " declared in configuration property " + orderProperty
-                + " but not found in an active plugin - ignoring.");
+            LOG.error("{} : {} declared in configuration property {} "
+                    + "but not found in an active plugin - ignoring.", 
+                    clazz.getSimpleName(), orderedFilter, orderProperty);
             continue;
           }
           sorted.add(f);
@@ -464,8 +477,8 @@ public class PluginRepository {
         for (int i = 0; i < sorted.size(); i++) {
           filter[i] = sorted.get(i);
           if (LOG.isTraceEnabled()) {
-            LOG.trace(clazz.getSimpleName() + " : filters[" + i + "] = "
-                + filter[i].getClass());
+            LOG.trace("{} : filters[{}] = {}", clazz.getSimpleName() , i,
+                    filter[i].getClass());
           }
         }
         objectCache.setObject(clazz.getName(), filter);
@@ -490,8 +503,8 @@ public class PluginRepository {
    */
   public static void main(String[] args) throws Exception {
     if (args.length < 2) {
-      System.err
-          .println("Usage: PluginRepository pluginId className [arg1 arg2 
...]");
+      System.err.println(
+              "Usage: PluginRepository pluginId className [arg1 arg2 ...]");
       return;
     }
     Configuration conf = NutchConfiguration.create();
@@ -508,8 +521,8 @@ public class PluginRepository {
     try {
       clazz = Class.forName(args[1], true, cl);
     } catch (Exception e) {
-      System.err.println("Could not load the class '" + args[1] + ": "
-          + e.getMessage());
+      System.err.println(
+              "Could not load the class '" + args[1] + ": " + e.getMessage());
       return;
     }
     Method m = null;
@@ -517,11 +530,108 @@ public class PluginRepository {
       m = clazz.getMethod("main", new Class<?>[] { args.getClass() });
     } catch (Exception e) {
       System.err.println("Could not find the 'main(String[])' method in class "
-          + args[1] + ": " + e.getMessage());
+              + args[1] + ": " + e.getMessage());
       return;
     }
     String[] subargs = new String[args.length - 2];
     System.arraycopy(args, 2, subargs, 0, subargs.length);
     m.invoke(null, new Object[] { subargs });
   }
+
+  /**
+   * Registers this PluginRepository to be invoked whenever URLs have to be
+   * parsed. This allows the registered protocol plugins to be checked for
+   * uncommon protocols.
+   */
+  private void registerURLStreamHandlerFactory() {
+    
org.apache.nutch.plugin.URLStreamHandlerFactory.getInstance().registerPluginRepository(this);
+  }
+
+  /**
+   * <p>Invoked whenever a {@link java.net.URL} needs to be instantiated. Tries to find a
+   * suitable extension and allows it to provide a {@link java.net.URLStreamHandler}.</p>
+   * This is done in several attempts:
+   * <ul>
+   * <li>Find a protocol plugin that implements the desired protocol. If found,
+   * instantiate it so eventually the plugin can install a {@link java.net.URLStreamHandler}
+   * through a static hook.</li>
+   * <li>If the plugin specifies a {@link java.net.URLStreamHandler} in its
+   * <code>plugin.xml</code> manifest, return an instance of this
+   * {@link java.net.URLStreamHandler}. Example:
+   * 
+   * <pre>
+   *  ...
+   *  &lt;implementation id="org.apache.nutch.protocol.foo.Foo" class="org.apache.nutch.protocol.foo.Foo"&gt;
+   *      &lt;parameter name="protocolName" value="foo"/&gt;
+   *      &lt;parameter name="urlStreamHandler" value="org.apache.nutch.protocol.foo.Handler"/&gt;
+   *  &lt;/implementation&gt;
+   *  ...
+   * </pre>
+   * </li>
+   * <li>If all else fails, return null. This will fall back to the JVM's method
+   * of evaluating the system property <code>java.protocol.handler.pkgs</code>.</li>
+   * </ul>
+   * @param protocol the URL scheme (e.g. <code>foo</code>) a handler is requested for
+   * @return the URLStreamHandler found, or null.
+   * @see java.net.URL
+   * @see <a href="https://issues.apache.org/jira/browse/NUTCH-2429">NUTCH-2429</a>
+   */
+  public URLStreamHandler createURLStreamHandler(String protocol) {
+    LOG.debug("Creating URLStreamHandler for protocol: {}", protocol);
+
+    if (fExtensionPoints != null) {
+      ExtensionPoint ep = fExtensionPoints
+              .get("org.apache.nutch.protocol.Protocol");
+      if (ep != null) {
+        Extension[] extensions = ep.getExtensions();
+        for (Extension extension : extensions) {
+          String p = extension.getAttribute("protocolName");
+          if (p != null && p.equals(protocol)) { // extensions without a protocolName are skipped
+            LOG.debug("Suitable protocolName attribute located: {}", p);
+
+            // instantiate the plugin. This allows it to execute a static hook,
+            // if present. Extensions and PluginInstances are cached already, so we
+            // should not create too many instances
+            Object extinst = null;
+            try {
+              extinst = extension.getExtensionInstance();
+              LOG.debug("Located extension instance class: {}", extinst.getClass().getName());
+            } catch (Exception e) {
+              LOG.warn("Could not find {}", extension.getId(), e);
+            }
+
+            // return the handler here, if possible
+            String handlerClass = extension.getAttribute("urlStreamHandler");
+            LOG.debug("Located URLStreamHandler: {}", handlerClass);
+            if (handlerClass != null) {
+              // the nutch classloader
+              ClassLoader cl = this.getClass().getClassLoader();
+              if (extinst != null) {
+                // the extension's classloader
+                cl = extinst.getClass().getClassLoader();
+              }
+
+              try {
+                // instantiate the handler and return it
+                Class<?> clazz = cl.loadClass(handlerClass);
+                return (URLStreamHandler) clazz.getDeclaredConstructor().newInstance();
+              } catch (Exception e) {
+                LOG.error("Could not instantiate protocol {} handler class {} defined by extension {}",
+                        protocol, handlerClass, extension.getId(), e);
+                return null;
+              }
+            }
+
+            LOG.debug("suitable protocol extension found that did not declare a handler");
+            return null;
+          }
+        }
+        LOG.debug("No suitable protocol extensions registered");
+      } else {
+        LOG.debug("No protocol extensions registered?");
+      }
+    }
+
+    return null;
+  }
 }
diff --git a/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java 
b/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java
new file mode 100644
index 0000000..a64454c
--- /dev/null
+++ b/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+import java.lang.ref.WeakReference;
+import java.net.URL;
+import java.net.URLStreamHandler;
+import java.util.ArrayList;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This URLStreamHandlerFactory knows about all the plugins
+ * in use and thus can create the correct URLStreamHandler
+ * even if it comes from a plugin classpath.
+ * As the JVM allows only one instance of URLStreamHandlerFactory
+ * to be registered, this class implements a singleton pattern.
+ * @author Hiran Chaudhuri
+ *
+ */
+public class URLStreamHandlerFactory
+    implements java.net.URLStreamHandlerFactory {
+  
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(URLStreamHandlerFactory.class);
+  
+  /** The singleton instance. */
+  private static URLStreamHandlerFactory instance;
+  
+  /** Here we register all PluginRepositories.
+   * In this class we do not know why several instances of PluginRepository
+   * are kept, nor do we know how long they will be used. To prevent
+   * a memory leak, this class must not keep references to PluginRepository
+   * but use WeakReference which allows PluginRepository to still be
+   * garbage collected. The price is that we need to clean the list of
+   * outdated references, which is done in the {@link #removeInvalidRefs()} method.
+   */
+  private ArrayList<WeakReference<PluginRepository>> prs;
+  
+  static {
+    instance = new URLStreamHandlerFactory();
+    URL.setURLStreamHandlerFactory(instance);
+    LOG.debug("Registered URLStreamHandlerFactory with the JVM.");
+  }
+  
+  private URLStreamHandlerFactory() {
+    prs = new ArrayList<>();
+  }
+
+  /** 
+   * Get the singleton instance of this class.
+   * @return a {@link org.apache.nutch.plugin.URLStreamHandlerFactory} instance
+   */
+  public static URLStreamHandlerFactory getInstance() {
+    return instance;
+  }
+  
+  /** Use this method once a new PluginRepository was created to register it.
+   * 
+   * @param pr The PluginRepository to be registered.
+   */
+  public void registerPluginRepository(PluginRepository pr) {
+    prs.add(new WeakReference<PluginRepository>(pr));
+    
+    removeInvalidRefs();
+  }
+
+  @Override
+  public URLStreamHandler createURLStreamHandler(String protocol) {
+    LOG.debug("Creating URLStreamHandler for protocol: {}", protocol);
+    
+    removeInvalidRefs();
+    
+    // find the 'correct' PluginRepository. For now we simply take the first.
+    // then ask it to return the URLStreamHandler
+    for(WeakReference<PluginRepository> ref: prs) {
+      PluginRepository pr = ref.get();
+      if(pr != null) {
+        // found PluginRepository. Let's get the URLStreamHandler...
+        return pr.createURLStreamHandler(protocol);
+      }
+    }
+    return null;
+  }
+
+  /** Maintains the list of PluginRepositories by
+   * removing the references whose referents have been
+   * garbage collected meanwhile.
+   */
+  private void removeInvalidRefs() {
+    LOG.debug("removeInvalidRefs()");
+    ArrayList<WeakReference<PluginRepository>> copy = new ArrayList<>(prs);
+    for(WeakReference<PluginRepository> ref: copy) {
+      if(ref.get() == null) {
+        prs.remove(ref);
+      }
+    }
+    LOG.debug("Removed the following invalid references: '{}' Remaining: 
'{}'", copy.size()-prs.size(), prs.size());
+  }
+}
diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java 
b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
index 8a23fbf..621484c 100644
--- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java
+++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
@@ -16,40 +16,37 @@
  */
 package org.apache.nutch.util;
 
-import java.io.File;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.text.SimpleDateFormat;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.MissingOptionException;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.TimingUtil;
-import org.apache.nutch.util.URLUtil;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.OptionBuilder;
-import org.apache.commons.cli.Options;
-import org.apache.commons.cli.MissingOptionException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Extracts some simple crawl completion stats from the crawldb
@@ -204,7 +201,14 @@ public class CrawlCompletionStats extends Configured 
implements Tool {
     public void map(Text urlText, CrawlDatum datum, Context context)
         throws IOException, InterruptedException {
 
-      URL url = new URL(urlText.toString());
+      URL url;
+      try {
+        url = new URL(urlText.toString());
+      } catch (MalformedURLException e) {
+        LOG.error("Failed to get host or domain from URL {}: {}",
+            urlText, e.getMessage());
+        return;
+      }
       String out = "";
       switch (mode) {
         case MODE_HOST:
diff --git a/src/java/org/apache/nutch/util/NutchJob.java 
b/src/java/org/apache/nutch/util/NutchJob.java
index 04b38df..3e852eb 100644
--- a/src/java/org/apache/nutch/util/NutchJob.java
+++ b/src/java/org/apache/nutch/util/NutchJob.java
@@ -19,12 +19,13 @@ package org.apache.nutch.util;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.mapreduce.Job;
+import org.apache.nutch.plugin.PluginRepository;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /** A {@link Job} for Nutch jobs. */
 public class NutchJob extends Job {
@@ -35,6 +36,11 @@ public class NutchJob extends Job {
   @SuppressWarnings("deprecation")
   public NutchJob(Configuration conf, String jobName) throws IOException {
     super(conf, jobName);
+    if (conf != null) {
+      // initialize plugins early to register URL stream handlers to support
+      // custom protocol implementations
+      PluginRepository.get(conf);
+    }
   }
 
   public static Job getInstance(Configuration conf) throws IOException {
diff --git a/src/java/org/apache/nutch/util/NutchTool.java 
b/src/java/org/apache/nutch/util/NutchTool.java
index 244ae99..a6a1cee 100644
--- a/src/java/org/apache/nutch/util/NutchTool.java
+++ b/src/java/org/apache/nutch/util/NutchTool.java
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.plugin.PluginRepository;
 
 public abstract class NutchTool extends Configured {
 
@@ -53,6 +54,14 @@ public abstract class NutchTool extends Configured {
   public NutchTool(){
     super(null);
   }
+  
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if(conf != null) {
+      PluginRepository.get(conf);
+    }
+  }
 
   /**
    * Get relative progress of the tool. Progress is represented as a
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java 
b/src/java/org/apache/nutch/util/SitemapProcessor.java
index b191f23..1a1955e 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -41,7 +41,6 @@ import 
org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
-
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.hostdb.HostDatum;
 import org.apache.nutch.net.URLFilters;
@@ -51,7 +50,6 @@ import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -82,13 +80,11 @@ import crawlercommons.sitemaps.SiteMapURL;
  * the sitemaps into the CrawlDb.</li>
  * </ol>
  *
- * <p>
- * For more details see:
- * https://cwiki.apache.org/confluence/display/NUTCH/SitemapFeature
- * </p>
+ * @see
+ * <a href="https://cwiki.apache.org/confluence/display/NUTCH/SitemapFeature">SitemapFeature</a>
  */
 public class SitemapProcessor extends Configured implements Tool {
-  public static final Logger LOG = 
LoggerFactory.getLogger(SitemapProcessor.class);
+  private static final Logger LOG = 
LoggerFactory.getLogger(SitemapProcessor.class);
   public static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd 
HH:mm:ss");
 
   public static final String CURRENT_NAME = "current";
diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java 
b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
index 24e7a1c..0d789ed 100644
--- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
@@ -16,31 +16,32 @@
  */
 package org.apache.nutch.util.domain;
 
-import java.io.File;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.text.SimpleDateFormat;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.TimingUtil;
 import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Extracts some very basic statistics about domains from the crawldb
@@ -175,7 +176,14 @@ public class DomainStatistics extends Configured 
implements Tool {
           || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
 
         try {
-          URL url = new URL(urlText.toString());
+          URL url;
+          try {
+            url = new URL(urlText.toString());
+          } catch (MalformedURLException e) {
+            LOG.error("Failed to get host or domain from URL {}: {}",
+                urlText, e.getMessage());
+            return;
+          }
           String out = null;
           switch (mode) {
           case MODE_HOST:
diff --git 
a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java 
b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
index e56aaa6..c0f1d6f 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
@@ -41,7 +41,7 @@ import org.slf4j.LoggerFactory;
 public class Any23IndexingFilter implements IndexingFilter {
 
   /** Logging instance */
-  public static final Logger LOG = 
LoggerFactory.getLogger(Any23IndexingFilter.class);
+  private static final Logger LOG = 
LoggerFactory.getLogger(Any23IndexingFilter.class);
   
   public static final String STRUCTURED_DATA = "structured_data";
 
diff --git 
a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java 
b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
index d9f0896..af7f135 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
@@ -58,7 +58,7 @@ import org.w3c.dom.DocumentFragment;
 public class Any23ParseFilter implements HtmlParseFilter {
 
   /** Logging instance */
-  public static final Logger LOG = 
LoggerFactory.getLogger(Any23ParseFilter.class);
+  private static final Logger LOG = 
LoggerFactory.getLogger(Any23ParseFilter.class);
 
   private Configuration conf = null;
 
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 95d7a16..7378096 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -71,6 +71,7 @@
     <ant dir="parsefilter-naivebayes" target="deploy"/>
     <ant dir="parsefilter-regex" target="deploy"/>
     <ant dir="protocol-file" target="deploy"/>
+    <ant dir="protocol-foo" target="deploy" />
     <ant dir="protocol-ftp" target="deploy"/>
     <ant dir="protocol-htmlunit" target="deploy" />
     <ant dir="protocol-http" target="deploy"/>
@@ -219,6 +220,7 @@
     <ant dir="parsefilter-naivebayes" target="clean" />
     <ant dir="parsefilter-regex" target="clean"/>
     <ant dir="protocol-file" target="clean"/>
+    <ant dir="protocol-foo" target="clean" />
     <ant dir="protocol-ftp" target="clean"/>
     <ant dir="protocol-htmlunit" target="clean" />
     <ant dir="protocol-http" target="clean"/>
diff --git 
a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
 
b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
index 6989feb..58e8993 100644
--- 
a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
+++ 
b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
@@ -52,7 +52,7 @@ import org.slf4j.LoggerFactory;
  */
 public class CSVIndexWriter implements IndexWriter {
 
-  public static final Logger LOG = LoggerFactory
+  private static final Logger LOG = LoggerFactory
       .getLogger(CSVIndexWriter.class);
 
   private Configuration config;
diff --git 
a/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java
 
b/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java
index f7a47e2..37acf12 100644
--- 
a/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java
+++ 
b/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java
@@ -40,7 +40,7 @@ import java.util.regex.Pattern;
 
 public class RabbitIndexWriter implements IndexWriter {
 
-  public static final Logger LOG = LoggerFactory
+  private static final Logger LOG = LoggerFactory
       .getLogger(RabbitIndexWriter.class);
 
   private String uri;
diff --git a/src/plugin/protocol-foo/build.xml 
b/src/plugin/protocol-foo/build.xml
new file mode 100755
index 0000000..240f448
--- /dev/null
+++ b/src/plugin/protocol-foo/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-foo" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>
diff --git a/src/plugin/protocol-foo/ivy.xml b/src/plugin/protocol-foo/ivy.xml
new file mode 100755
index 0000000..1a86d68
--- /dev/null
+++ b/src/plugin/protocol-foo/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>
diff --git a/src/plugin/protocol-foo/plugin.xml 
b/src/plugin/protocol-foo/plugin.xml
new file mode 100755
index 0000000..850afe3
--- /dev/null
+++ b/src/plugin/protocol-foo/plugin.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<plugin
+   id="protocol-foo"
+   name="Foo Protocol Example Plug-in"
+   version="1.0.0"
+   provider-name="Hiran Chaudhuri">
+
+   <runtime>
+      <library name="protocol-foo.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.foo"
+              name="FooProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.foo.Foo"
+                      class="org.apache.nutch.protocol.foo.Foo">
+        <parameter name="protocolName" value="foo"/>
+               <parameter name="urlStreamHandler" 
value="org.apache.nutch.protocol.foo.Handler"/>
+      </implementation>
+      
+   </extension>
+
+</plugin>
diff --git 
a/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Foo.java 
b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Foo.java
new file mode 100755
index 0000000..0f56f23
--- /dev/null
+++ b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Foo.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.foo;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import crawlercommons.robots.BaseRobotRules;
+
+public class Foo implements Protocol {
+  protected static final Logger LOG = LoggerFactory.getLogger(Foo.class);
+
+  private Configuration conf;
+
+  @Override
+  public Configuration getConf() {
+    LOG.debug("getConf()");
+    return this.conf;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Serves a small, hard-coded virtual site for the dummy <code>foo</code>
+   * protocol. All content is built from literals in this method. The URL is
+   * matched as an exact string against the following entries:
+   *
+   * <pre>
+   * foo://example.com - listing with one directory and one file
+   * foo://example.com/ - same listing as above
+   * foo://example.com/a/ - directory listing with two files
+   * foo://example.com/a/aa.txt - text file
+   * foo://example.com/a/ab.txt - text file
+   * foo://example.com/a.txt - text file
+   * </pre>
+   *
+   * Any other URL yields empty content of type <code>foo/something</code>
+   * together with {@link ProtocolStatus#STATUS_GONE}.
+   *
+   * @param url
+   *          the URL to serve
+   * @param datum
+   *          crawl datum belonging to the URL; only logged here
+   * @return content and status for the URL; if the URL cannot be parsed, an
+   *         output with <code>null</code> content and
+   *         {@link ProtocolStatus#STATUS_GONE}
+   */
+  @Override
+  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
+    LOG.debug("getProtocolOutput({}, {})", url, datum);
+
+    try {
+      String urlstr = String.valueOf(url);
+      URL u = new URL(urlstr);
+      URL base = new URL(u, ".");
+      // the defaults describe a URL that does not exist
+      byte[] bytes = new byte[0];
+      String contentType = "foo/something";
+      ProtocolStatus status = ProtocolStatus.STATUS_GONE;
+
+      switch (urlstr) {
+      case "foo://example.com":
+      case "foo://example.com/": {
+        String time = HttpDateFormat.toString(System.currentTimeMillis());
+        contentType = "text/html";
+        StringBuilder sb = new StringBuilder();
+        sb.append("<html><head>");
+        sb.append("<title>Index of /</title></head>\n");
+        sb.append("<body><h1>Index of /</h1><pre>\n");
+        // add directory
+        sb.append("<a href='a/'>a/</a>\t").append(time).append("\t-\n");
+        // add file
+        sb.append("<a href='a.txt'>a.txt</a>\t").append(time).append("\t0\n");
+        sb.append("</pre></body></html>");
+        // explicit charset: do not depend on the platform default
+        bytes = sb.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
+        status = ProtocolStatus.STATUS_SUCCESS;
+        break;
+      }
+      case "foo://example.com/a/": {
+        String time = HttpDateFormat.toString(System.currentTimeMillis());
+        contentType = "text/html";
+        StringBuilder sb = new StringBuilder();
+        sb.append("<html><head>");
+        sb.append("<title>Index of /a/</title></head>\n");
+        sb.append("<body><h1>Index of /a/</h1><pre>\n");
+        // add files
+        sb.append("<a href='aa.txt'>aa.txt</a>\t").append(time).append("\t0\n");
+        sb.append("<a href='ab.txt'>ab.txt</a>\t").append(time).append("\t0\n");
+        sb.append("</pre></body></html>");
+        bytes = sb.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
+        status = ProtocolStatus.STATUS_SUCCESS;
+        break;
+      }
+      case "foo://example.com/a.txt":
+      case "foo://example.com/a/aa.txt":
+      case "foo://example.com/a/ab.txt": {
+        contentType = "text/plain";
+        String text = "In publishing and graphic design, lorem ipsum is a filler "
+            + "text or greeking commonly used to demonstrate the textual "
+            + "elements of a graphic document or visual presentation. "
+            + "Replacing meaningful content with placeholder text allows "
+            + "designers to design the form of the content before the content "
+            + "itself has been produced.";
+        bytes = text.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+        status = ProtocolStatus.STATUS_SUCCESS;
+        break;
+      }
+      default:
+        LOG.warn(
+            "Unknown url '{}'. This dummy implementation only supports 'foo://example.com'",
+            url);
+        // all our default values are set for URLs that do not exist.
+        break;
+      }
+
+      Metadata metadata = new Metadata();
+      Content content = new Content(urlstr, String.valueOf(base), bytes,
+          contentType, metadata, getConf());
+
+      return new ProtocolOutput(content, status);
+    } catch (MalformedURLException mue) {
+      // a trailing Throwable argument makes SLF4J log the stack trace
+      LOG.error("Could not retrieve {}", url, mue);
+      // claim STATUS_GONE to tell nutch to never ever re-request this URL
+      return new ProtocolOutput(null, ProtocolStatus.STATUS_GONE);
+    }
+  }
+
+  /**
+   * Hands back the shared empty rule set for every request; the arguments are
+   * only written to the debug log.
+   *
+   * @param url
+   *          URL the rules are requested for
+   * @param datum
+   *          crawl datum belonging to the URL
+   * @param robotsTxtContent
+   *          list supplied by the caller; not touched by this implementation
+   * @return {@link RobotRulesParser#EMPTY_RULES}
+   */
+  @Override
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    LOG.debug("getRobotRules({}, {}, {})", url, datum, robotsTxtContent);
+    final BaseRobotRules noRules = RobotRulesParser.EMPTY_RULES;
+    return noRules;
+  }
+}
diff --git a/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Handler.java b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Handler.java
new file mode 100644
index 0000000..27f1837
--- /dev/null
+++ b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Handler.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.foo;
+
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLStreamHandler;
+
+/**
+ * {@link URLStreamHandler} shipped with the protocol-foo plugin.
+ * <p>
+ * Opening a connection is not supported: {@link #openConnection(URL)} always
+ * throws. NOTE(review): presumably this class only has to exist so that
+ * <code>foo://</code> URLs can be constructed through {@link URL} - confirm
+ * against the plugin system's URLStreamHandlerFactory.
+ */
+public class Handler extends URLStreamHandler {
+
+  /**
+   * Not implemented.
+   *
+   * @param u
+   *          the URL a connection was requested for (ignored)
+   * @return never returns normally
+   * @throws UnsupportedOperationException
+   *           always
+   */
+  @Override
+  protected URLConnection openConnection(URL u) {
+    throw new UnsupportedOperationException("not yet implemented");
+  }
+}

[nutch] branch master updated: NUTCH-2429 Fix Plugin System to allow protocol plugins to bundle their URLStreamHandlers (#720)

Reply via email to