This is an automated email from the ASF dual-hosted git repository. lewismc pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new e76d69f NUTCH-2429 Fix Plugin System to allow protocol plugins to bundle their URLStreamHandlers (#720) e76d69f is described below commit e76d69fe13902fd2f3a98660dd2bac52c2ea568c Author: Lewis John McGibbney <lewis.mcgibb...@gmail.com> AuthorDate: Fri Jan 7 20:07:54 2022 -0800 NUTCH-2429 Fix Plugin System to allow protocol plugins to bundle their URLStreamHandlers (#720) * NUTCH-2429 Fix Plugin System to allow protocol plugins to bundle their URLStreamHandlers Co-authored-by: Hiran Chaudhuri <hiran.chaudh...@mail.de> --- build.xml | 1 + src/java/org/apache/nutch/crawl/CrawlDbReader.java | 43 ++-- src/java/org/apache/nutch/parse/ParserChecker.java | 5 + .../apache/nutch/plugin/PluginManifestParser.java | 66 +++--- .../org/apache/nutch/plugin/PluginRepository.java | 244 +++++++++++++++------ .../nutch/plugin/URLStreamHandlerFactory.java | 115 ++++++++++ .../apache/nutch/util/CrawlCompletionStats.java | 40 ++-- src/java/org/apache/nutch/util/NutchJob.java | 12 +- src/java/org/apache/nutch/util/NutchTool.java | 9 + .../org/apache/nutch/util/SitemapProcessor.java | 10 +- .../apache/nutch/util/domain/DomainStatistics.java | 20 +- .../apache/nutch/any23/Any23IndexingFilter.java | 2 +- .../org/apache/nutch/any23/Any23ParseFilter.java | 2 +- src/plugin/build.xml | 2 + .../nutch/indexwriter/csv/CSVIndexWriter.java | 2 +- .../indexwriter/rabbit/RabbitIndexWriter.java | 2 +- src/plugin/protocol-foo/build.xml | 22 ++ src/plugin/protocol-foo/ivy.xml | 41 ++++ src/plugin/protocol-foo/plugin.xml | 48 ++++ .../java/org/apache/nutch/protocol/foo/Foo.java | 141 ++++++++++++ .../org/apache/nutch/protocol/foo/Handler.java | 28 +++ 21 files changed, 696 insertions(+), 159 deletions(-) diff --git a/build.xml b/build.xml index ecef1e7..2c0eef0 100644 --- a/build.xml +++ b/build.xml @@ -1272,6 +1272,7 @@ <source path="${plugins.dir}/parsefilter-regex/src/test/" /> <source path="${plugins.dir}/protocol-file/src/java/" 
/> <source path="${plugins.dir}/protocol-file/src/test/" /> + <source path="${plugins.dir}/protocol-foo/src/java/" /> <source path="${plugins.dir}/protocol-ftp/src/java/" /> <source path="${plugins.dir}/protocol-htmlunit/src/java/" /> <source path="${plugins.dir}/protocol-http/src/java/" /> diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java index 2a20a56..f31210a 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java @@ -16,11 +16,12 @@ */ package org.apache.nutch.crawl; +import java.io.Closeable; import java.io.DataOutputStream; import java.io.File; import java.io.IOException; -import java.io.Closeable; import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; import java.net.URL; import java.nio.ByteBuffer; import java.util.ArrayList; @@ -32,16 +33,11 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Random; +import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.TreeMap; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.tdunning.math.stats.MergingDigest; -import com.tdunning.math.stats.TDigest; +import org.apache.commons.jexl3.JexlScript; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -55,18 +51,18 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import 
org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner; -import org.apache.hadoop.mapreduce.RecordWriter; -import org.apache.hadoop.mapreduce.TaskAttemptContext; -import org.apache.hadoop.util.ToolRunner; import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.AbstractChecker; import org.apache.nutch.util.JexlUtil; import org.apache.nutch.util.NutchConfiguration; @@ -74,7 +70,8 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.SegmentReaderUtil; import org.apache.nutch.util.StringUtil; import org.apache.nutch.util.TimingUtil; -import org.apache.commons.jexl3.JexlScript; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.fasterxml.jackson.core.JsonGenerationException; import com.fasterxml.jackson.core.JsonGenerator; @@ -84,6 +81,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectWriter; import com.fasterxml.jackson.databind.SerializerProvider; import com.fasterxml.jackson.databind.module.SimpleModule; +import com.tdunning.math.stats.MergingDigest; +import com.tdunning.math.stats.TDigest; /** * Read utility for the CrawlDB. 
@@ -375,10 +374,14 @@ public class CrawlDbReader extends AbstractChecker implements Closeable { context.write(new Text("fit"), fetchInterval); if (sort) { - URL u = new URL(key.toString()); - String host = u.getHost(); - context.write(new Text("status " + value.getStatus() + " " + host), - COUNT_1); + try { + URL u = new URL(key.toString()); + String host = u.getHost(); + context.write(new Text("status " + value.getStatus() + " " + host), + COUNT_1); + } catch (MalformedURLException e) { + LOG.error("Failed to get host from URL {}: {}", key.toString(), e.getMessage()); + } } } } diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java index 7b0e76a..6c82a51 100644 --- a/src/java/org/apache/nutch/parse/ParserChecker.java +++ b/src/java/org/apache/nutch/parse/ParserChecker.java @@ -28,6 +28,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.SignatureFactory; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.plugin.PluginRepository; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.ProtocolOutput; import org.apache.nutch.scoring.ScoringFilters; @@ -106,6 +107,10 @@ public class ParserChecker extends AbstractChecker { System.exit(-1); } + // initialize plugins early to register URL stream handlers to support + // custom protocol implementations + PluginRepository.get(getConf()); + int numConsumed; for (int i = 0; i < args.length; i++) { if (args[i].equals("-normalize")) { diff --git a/src/java/org/apache/nutch/plugin/PluginManifestParser.java b/src/java/org/apache/nutch/plugin/PluginManifestParser.java index d7280ad..4c845b4 100644 --- a/src/java/org/apache/nutch/plugin/PluginManifestParser.java +++ b/src/java/org/apache/nutch/plugin/PluginManifestParser.java @@ -29,9 +29,9 @@ import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import 
javax.xml.parsers.ParserConfigurationException; -import org.slf4j.Logger; - import org.apache.hadoop.conf.Configuration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; @@ -39,8 +39,9 @@ import org.w3c.dom.NodeList; import org.xml.sax.SAXException; /** - * The <code>PluginManifestParser</code> parser just parse the manifest file in - * all plugin directories. + * The <code>PluginManifestParser</code> provides a mechanism for + * parsing Nutch plugin manifest files (<code>plugin.xml</code>) contained + * in a {@link java.lang.String[]} of plugin directories. * * @author joa23 */ @@ -49,17 +50,15 @@ public class PluginManifestParser { private static final String ATTR_CLASS = "class"; private static final String ATTR_ID = "id"; - public static final Logger LOG = PluginRepository.LOG; + protected static final Logger LOG = LoggerFactory.getLogger(PluginManifestParser.class); - private static final boolean WINDOWS = System.getProperty("os.name") - .startsWith("Windows"); + private static final boolean WINDOWS = System.getProperty("os.name").startsWith("Windows"); private Configuration conf; private PluginRepository pluginRepository; - public PluginManifestParser(Configuration conf, - PluginRepository pluginRepository) { + public PluginManifestParser(Configuration conf, PluginRepository pluginRepository) { this.conf = conf; this.pluginRepository = pluginRepository; } @@ -83,18 +82,17 @@ public class PluginManifestParser { if (directory == null) { continue; } - LOG.info("Plugins: looking in: " + directory.getAbsolutePath()); + LOG.info("Plugins: looking in: {}", directory.getAbsolutePath()); for (File oneSubFolder : directory.listFiles()) { if (oneSubFolder.isDirectory()) { String manifestPath = oneSubFolder.getAbsolutePath() + File.separator - + "plugin.xml"; + + "plugin.xml"; try { - LOG.debug("parsing: " + manifestPath); + LOG.debug("Parsing: {}", manifestPath); PluginDescriptor 
p = parseManifestFile(manifestPath); map.put(p.getPluginId(), p); } catch (Exception e) { - LOG.warn("Error while loading plugin `" + manifestPath + "` " - + e.toString()); + LOG.warn("Error while loading plugin {}: {}", manifestPath, e.toString()); } } } @@ -113,13 +111,13 @@ public class PluginManifestParser { if (!directory.isAbsolute()) { URL url = PluginManifestParser.class.getClassLoader().getResource(name); if (url == null && directory.exists() && directory.isDirectory() - && directory.listFiles().length > 0) { + && directory.listFiles().length > 0) { return directory; // relative path that is not in the classpath } else if (url == null) { - LOG.warn("Plugins: directory not found: " + name); + LOG.warn("Plugins: directory not found: {}", name); return null; } else if (!"file".equals(url.getProtocol())) { - LOG.warn("Plugins: not a file: url. Can't load plugins from: " + url); + LOG.warn("Plugins: not a file: url. Can't load plugins from: {}", url); return null; } String path = url.getPath(); @@ -131,7 +129,7 @@ public class PluginManifestParser { } directory = new File(path); } else if (!directory.exists()) { - LOG.warn("Plugins: directory not found: " + name); + LOG.warn("Plugins: directory not found: {}", name); return null; } return directory; @@ -145,8 +143,8 @@ public class PluginManifestParser { * @throws MalformedURLException */ private PluginDescriptor parseManifestFile(String pManifestPath) - throws MalformedURLException, SAXException, IOException, - ParserConfigurationException { + throws MalformedURLException, SAXException, IOException, + ParserConfigurationException { Document document = parseXML(new File(pManifestPath).toURI().toURL()); String pPath = new File(pManifestPath).getParent(); return parsePlugin(document, pPath); @@ -160,8 +158,8 @@ public class PluginManifestParser { * @throws ParserConfigurationException * @throws DocumentException */ - private Document parseXML(URL url) throws SAXException, IOException, - 
ParserConfigurationException { + private Document parseXML(URL url) + throws SAXException, IOException, ParserConfigurationException { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); return builder.parse(url.openStream()); @@ -172,7 +170,7 @@ public class PluginManifestParser { * @throws MalformedURLException */ private PluginDescriptor parsePlugin(Document pDocument, String pPath) - throws MalformedURLException { + throws MalformedURLException { Element rootElement = pDocument.getDocumentElement(); String id = rootElement.getAttribute(ATTR_ID); String name = rootElement.getAttribute(ATTR_NAME); @@ -183,9 +181,9 @@ public class PluginManifestParser { pluginClazz = rootElement.getAttribute(ATTR_CLASS); } PluginDescriptor pluginDescriptor = new PluginDescriptor(id, version, name, - providerName, pluginClazz, pPath, this.conf); - LOG.debug("plugin: id=" + id + " name=" + name + " version=" + version - + " provider=" + providerName + "class=" + pluginClazz); + providerName, pluginClazz, pPath, this.conf); + LOG.debug("plugin: id={} name={} version={} provider={} class={}", + id, name, version, providerName, pluginClazz); parseExtension(rootElement, pluginDescriptor); parseExtensionPoints(rootElement, pluginDescriptor); parseLibraries(rootElement, pluginDescriptor); @@ -199,7 +197,7 @@ public class PluginManifestParser { * @throws MalformedURLException */ private void parseRequires(Element pRootElement, PluginDescriptor pDescriptor) - throws MalformedURLException { + throws MalformedURLException { NodeList nodelist = pRootElement.getElementsByTagName("requires"); if (nodelist.getLength() > 0) { @@ -222,8 +220,8 @@ public class PluginManifestParser { * @param pDescriptor * @throws MalformedURLException */ - private void parseLibraries(Element pRootElement, PluginDescriptor pDescriptor) - throws MalformedURLException { + private void parseLibraries(Element pRootElement, + PluginDescriptor 
pDescriptor) throws MalformedURLException { NodeList nodelist = pRootElement.getElementsByTagName("runtime"); if (nodelist.getLength() > 0) { @@ -248,7 +246,7 @@ public class PluginManifestParser { * @param pluginDescriptor */ private void parseExtensionPoints(Element pRootElement, - PluginDescriptor pPluginDescriptor) { + PluginDescriptor pPluginDescriptor) { NodeList list = pRootElement.getElementsByTagName("extension-point"); if (list != null) { for (int i = 0; i < list.getLength(); i++) { @@ -267,7 +265,7 @@ public class PluginManifestParser { * @param pluginDescriptor */ private void parseExtension(Element pRootElement, - PluginDescriptor pPluginDescriptor) { + PluginDescriptor pPluginDescriptor) { NodeList extensions = pRootElement.getElementsByTagName("extension"); if (extensions != null) { for (int i = 0; i < extensions.getLength(); i++) { @@ -286,14 +284,14 @@ public class PluginManifestParser { String extensionClass = oneImplementation.getAttribute(ATTR_CLASS); LOG.debug("impl: point=" + pointId + " class=" + extensionClass); Extension extension = new Extension(pPluginDescriptor, pointId, id, - extensionClass, this.conf, this.pluginRepository); + extensionClass, this.conf, this.pluginRepository); NodeList parameters = oneImplementation - .getElementsByTagName("parameter"); + .getElementsByTagName("parameter"); if (parameters != null) { for (int k = 0; k < parameters.getLength(); k++) { Element param = (Element) parameters.item(k); extension.addAttribute(param.getAttribute(ATTR_NAME), - param.getAttribute("value")); + param.getAttribute("value")); } } pPluginDescriptor.addExtension(extension); diff --git a/src/java/org/apache/nutch/plugin/PluginRepository.java b/src/java/org/apache/nutch/plugin/PluginRepository.java index 44df3a2..726da45 100644 --- a/src/java/org/apache/nutch/plugin/PluginRepository.java +++ b/src/java/org/apache/nutch/plugin/PluginRepository.java @@ -21,30 +21,39 @@ import java.lang.reflect.Array; import java.lang.reflect.Constructor; 
import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; +import java.net.URLStreamHandler; +import java.net.URLStreamHandlerFactory; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; -import java.util.WeakHashMap; import java.util.List; import java.util.Map; +import java.util.WeakHashMap; import java.util.regex.Pattern; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.ObjectCache; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * The plugin repositority is a registry of all plugins. + * <p>The plugin repositority is a registry of all plugins.</p> * - * At system boot up a repositority is built by parsing the mainifest files of + * <p>At system boot up a repositority is built by parsing the mainifest files of * all plugins. Plugins that require other plugins which do not exist are not * registed. For each plugin a plugin descriptor instance will be created. The * descriptor represents all meta information about a plugin. So a plugin * instance will be created later when it is required, this allow lazy plugin - * loading. + * loading.</p> + * + * <p>As protocol-plugins need to be registered with the JVM as well, this class + * also acts as an {@link java.net.URLStreamHandlerFactory} that registers with + * the JVM and supports all the new protocols as if they were native. 
Details of + * how the JVM creates URLs can be seen in the API documentation for the + * <a href="https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/net/URL.html#%3Cinit%3E(java.lang.String,java.lang.String,int,java.lang.String)">URL constructor</a>.</p> */ -public class PluginRepository { +public class PluginRepository implements URLStreamHandlerFactory { private static final WeakHashMap<String, PluginRepository> CACHE = new WeakHashMap<>(); private boolean auto; @@ -59,8 +68,7 @@ public class PluginRepository { private Configuration conf; - protected static final Logger LOG = LoggerFactory - .getLogger(MethodHandles.lookup().lookupClass()); + protected static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); /** * @param conf a populated {@link Configuration} @@ -73,26 +81,29 @@ public class PluginRepository { this.auto = conf.getBoolean("plugin.auto-activation", true); String[] pluginFolders = conf.getStrings("plugin.folders"); PluginManifestParser manifestParser = new PluginManifestParser(this.conf, - this); + this); Map<String, PluginDescriptor> allPlugins = manifestParser - .parsePluginFolder(pluginFolders); + .parsePluginFolder(pluginFolders); if (allPlugins.isEmpty()) { LOG.warn("No plugins found on paths of property plugin.folders=\"{}\"", - conf.get("plugin.folders")); + conf.get("plugin.folders")); } Pattern excludes = Pattern.compile(conf.get("plugin.excludes", "")); Pattern includes = Pattern.compile(conf.get("plugin.includes", "")); Map<String, PluginDescriptor> filteredPlugins = filter(excludes, includes, - allPlugins); + allPlugins); fRegisteredPlugins = getDependencyCheckedPlugins(filteredPlugins, - this.auto ? allPlugins : filteredPlugins); + this.auto ? 
allPlugins : filteredPlugins); installExtensionPoints(fRegisteredPlugins); try { installExtensions(fRegisteredPlugins); } catch (PluginRuntimeException e) { - LOG.error(e.toString()); + LOG.error("Could not install extensions.", e.toString()); throw new RuntimeException(e.getMessage()); } + + registerURLStreamHandlerFactory(); + displayStatus(); } @@ -122,7 +133,7 @@ public class PluginRepository { for (PluginDescriptor plugin : plugins) { for (ExtensionPoint point : plugin.getExtenstionPoints()) { String xpId = point.getId(); - LOG.debug("Adding extension point " + xpId); + LOG.debug("Adding extension point {}", xpId); fExtensionPoints.put(xpId, point); } } @@ -132,16 +143,15 @@ public class PluginRepository { * @param pRegisteredPlugins */ private void installExtensions(List<PluginDescriptor> pRegisteredPlugins) - throws PluginRuntimeException { + throws PluginRuntimeException { for (PluginDescriptor descriptor : pRegisteredPlugins) { for (Extension extension : descriptor.getExtensions()) { String xpId = extension.getTargetPoint(); ExtensionPoint point = getExtensionPoint(xpId); if (point == null) { - throw new PluginRuntimeException("Plugin (" - + descriptor.getPluginId() + "), " + "extension point: " + xpId - + " does not exist."); + throw new PluginRuntimeException("Plugin (" + descriptor.getPluginId() + + "), " + "extension point: " + xpId + " does not exist."); } point.addExtension(extension); } @@ -149,10 +159,10 @@ public class PluginRepository { } private void getPluginCheckedDependencies(PluginDescriptor plugin, - Map<String, PluginDescriptor> plugins, - Map<String, PluginDescriptor> dependencies, - Map<String, PluginDescriptor> branch) throws MissingDependencyException, - CircularDependencyException { + Map<String, PluginDescriptor> plugins, + Map<String, PluginDescriptor> dependencies, + Map<String, PluginDescriptor> branch) + throws MissingDependencyException, CircularDependencyException { if (dependencies == null) { dependencies = new HashMap<>(); @@ 
-166,24 +176,24 @@ public class PluginRepository { for (String id : plugin.getDependencies()) { PluginDescriptor dependency = plugins.get(id); if (dependency == null) { - throw new MissingDependencyException("Missing dependency " + id - + " for plugin " + plugin.getPluginId()); + throw new MissingDependencyException( + "Missing dependency " + id + " for plugin " + plugin.getPluginId()); } if (branch.containsKey(id)) { throw new CircularDependencyException("Circular dependency detected " - + id + " for plugin " + plugin.getPluginId()); + + id + " for plugin " + plugin.getPluginId()); } dependencies.put(id, dependency); getPluginCheckedDependencies(plugins.get(id), plugins, dependencies, - branch); + branch); } branch.remove(plugin.getPluginId()); } private Map<String, PluginDescriptor> getPluginCheckedDependencies( - PluginDescriptor plugin, Map<String, PluginDescriptor> plugins) - throws MissingDependencyException, CircularDependencyException { + PluginDescriptor plugin, Map<String, PluginDescriptor> plugins) + throws MissingDependencyException, CircularDependencyException { Map<String, PluginDescriptor> dependencies = new HashMap<>(); Map<String, PluginDescriptor> branch = new HashMap<>(); getPluginCheckedDependencies(plugin, plugins, dependencies, branch); @@ -198,7 +208,8 @@ public class PluginRepository { * @return List */ private List<PluginDescriptor> getDependencyCheckedPlugins( - Map<String, PluginDescriptor> filtered, Map<String, PluginDescriptor> all) { + Map<String, PluginDescriptor> filtered, + Map<String, PluginDescriptor> all) { if (filtered == null) { return null; } @@ -209,7 +220,7 @@ public class PluginRepository { checked.putAll(getPluginCheckedDependencies(plugin, all)); checked.put(plugin.getPluginId(), plugin); } catch (MissingDependencyException mde) { - // Logger exception and ignore plugin + // Log exception and ignore plugin LOG.warn(mde.getMessage()); } catch (CircularDependencyException cde) { // Simply ignore this plugin @@ -225,8 +236,8 
@@ public class PluginRepository { * @return PluginDescriptor[] */ public PluginDescriptor[] getPluginDescriptors() { - return fRegisteredPlugins.toArray(new PluginDescriptor[fRegisteredPlugins - .size()]); + return fRegisteredPlugins + .toArray(new PluginDescriptor[fRegisteredPlugins.size()]); } /** @@ -255,14 +266,14 @@ public class PluginRepository { } /** - * Returns a instance of a plugin. Plugin instances are cached. So a plugin + * <p>Returns a instance of a plugin. Plugin instances are cached. So a plugin * exist only as one instance. This allow a central management of plugin own - * resources. + * resources.</p> * - * After creating the plugin instance the startUp() method is invoked. The + * <p>After creating the plugin instance the startUp() method is invoked. The * plugin use a own classloader that is used as well by all instance of * extensions of the same plugin. This class loader use all exported libraries - * from the dependend plugins and all plugin libraries. + * from the dependend plugins and all plugin libraries.</p> * * @param pDescriptor a {@link PluginDescriptor} for which to retrieve a * {@link Plugin} instance @@ -270,7 +281,7 @@ public class PluginRepository { * @throws PluginRuntimeException if there is a fatal runtime plugin error */ public Plugin getPluginInstance(PluginDescriptor pDescriptor) - throws PluginRuntimeException { + throws PluginRuntimeException { if (fActivatedPlugins.containsKey(pDescriptor.getPluginId())) return fActivatedPlugins.get(pDescriptor.getPluginId()); try { @@ -280,11 +291,11 @@ public class PluginRepository { // Suggested by Stefan Groschupf <s...@media-style.com> synchronized (pDescriptor) { Class<?> pluginClass = getCachedClass(pDescriptor, - pDescriptor.getPluginClass()); - Constructor<?> constructor = pluginClass.getConstructor(new Class<?>[] { - PluginDescriptor.class, Configuration.class }); - Plugin plugin = (Plugin) constructor.newInstance(new Object[] { - pDescriptor, this.conf }); + 
pDescriptor.getPluginClass()); + Constructor<?> constructor = pluginClass.getConstructor( + new Class<?>[] { PluginDescriptor.class, Configuration.class }); + Plugin plugin = (Plugin) constructor + .newInstance(new Object[] { pDescriptor, this.conf }); plugin.startUp(); fActivatedPlugins.put(pDescriptor.getPluginId(), plugin); return plugin; @@ -302,11 +313,13 @@ public class PluginRepository { } } - /* - * (non-Javadoc) - * + /** + * Attempts to shut down all activated plugins. + * @deprecated + * @see <a href="https://openjdk.java.net/jeps/421">JEP 421: Deprecate Finalization for Removal</a> * @see java.lang.Object#finalize() */ + @Deprecated public void finalize() throws Throwable { shutDownActivatedPlugins(); } @@ -323,7 +336,7 @@ public class PluginRepository { } public Class getCachedClass(PluginDescriptor pDescriptor, String className) - throws ClassNotFoundException { + throws ClassNotFoundException { Map<PluginClassLoader, Class> descMap = CLASS_CACHE.get(className); if (descMap == null) { descMap = new HashMap<>(); @@ -339,14 +352,14 @@ public class PluginRepository { } private void displayStatus() { - LOG.info("Plugin Auto-activation mode: [" + this.auto + "]"); + LOG.info("Plugin Auto-activation mode: [{}]", this.auto); LOG.info("Registered Plugins:"); if ((fRegisteredPlugins == null) || (fRegisteredPlugins.size() == 0)) { LOG.info("\tNONE"); } else { for (PluginDescriptor plugin : fRegisteredPlugins) { - LOG.info("\t" + plugin.getName() + " (" + plugin.getPluginId() + ")"); + LOG.info("\t{} ({})", plugin.getName(), plugin.getPluginId()); } } @@ -355,7 +368,7 @@ public class PluginRepository { LOG.info("\tNONE"); } else { for (ExtensionPoint ep : fExtensionPoints.values()) { - LOG.info("\t" + ep.getName() + " (" + ep.getId() + ")"); + LOG.info("\t ({})", ep.getName(), ep.getId()); } } } @@ -372,7 +385,7 @@ public class PluginRepository { * @return map of plugins matching the configuration */ private Map<String, PluginDescriptor> filter(Pattern excludes, 
- Pattern includes, Map<String, PluginDescriptor> plugins) { + Pattern includes, Map<String, PluginDescriptor> plugins) { Map<String, PluginDescriptor> map = new HashMap<>(); @@ -391,11 +404,11 @@ public class PluginRepository { } if (!includes.matcher(id).matches()) { - LOG.debug("not including: " + id); + LOG.debug("not including: {}", id); continue; } if (excludes.matcher(id).matches()) { - LOG.debug("excluding: " + id); + LOG.debug("excluding: {}", id); continue; } map.put(plugin.getPluginId(), plugin); @@ -419,7 +432,7 @@ public class PluginRepository { * @return array of plugin instances */ public synchronized Object[] getOrderedPlugins(Class<?> clazz, - String xPointId, String orderProperty) { + String xPointId, String orderProperty) { Object[] filters; ObjectCache objectCache = ObjectCache.get(conf); filters = (Object[]) objectCache.getObject(clazz.getName()); @@ -434,8 +447,8 @@ public class PluginRepository { } try { - ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( - xPointId); + ExtensionPoint point = PluginRepository.get(conf) + .getExtensionPoint(xPointId); if (point == null) throw new RuntimeException(xPointId + " not found."); Extension[] extensions = point.getExtensions(); @@ -453,9 +466,9 @@ public class PluginRepository { for (String orderedFilter : orderOfFilters) { Object f = filterMap.get(orderedFilter); if (f == null) { - LOG.error(clazz.getSimpleName() + " : " + orderedFilter - + " declared in configuration property " + orderProperty - + " but not found in an active plugin - ignoring."); + LOG.error("{} : {} declared in configuration property {} " + + "but not found in an active plugin - ignoring.", + clazz.getSimpleName(), orderedFilter, orderProperty); continue; } sorted.add(f); @@ -464,8 +477,8 @@ public class PluginRepository { for (int i = 0; i < sorted.size(); i++) { filter[i] = sorted.get(i); if (LOG.isTraceEnabled()) { - LOG.trace(clazz.getSimpleName() + " : filters[" + i + "] = " - + filter[i].getClass()); + 
LOG.trace("{} : filters[{}] = {}", clazz.getSimpleName() , i, + filter[i].getClass()); } } objectCache.setObject(clazz.getName(), filter); @@ -490,8 +503,8 @@ public class PluginRepository { */ public static void main(String[] args) throws Exception { if (args.length < 2) { - System.err - .println("Usage: PluginRepository pluginId className [arg1 arg2 ...]"); + System.err.println( + "Usage: PluginRepository pluginId className [arg1 arg2 ...]"); return; } Configuration conf = NutchConfiguration.create(); @@ -508,8 +521,8 @@ public class PluginRepository { try { clazz = Class.forName(args[1], true, cl); } catch (Exception e) { - System.err.println("Could not load the class '" + args[1] + ": " - + e.getMessage()); + System.err.println( + "Could not load the class '" + args[1] + ": " + e.getMessage()); return; } Method m = null; @@ -517,11 +530,108 @@ public class PluginRepository { m = clazz.getMethod("main", new Class<?>[] { args.getClass() }); } catch (Exception e) { System.err.println("Could not find the 'main(String[])' method in class " - + args[1] + ": " + e.getMessage()); + + args[1] + ": " + e.getMessage()); return; } String[] subargs = new String[args.length - 2]; System.arraycopy(args, 2, subargs, 0, subargs.length); m.invoke(null, new Object[] { subargs }); } + + /** + * Registers this PluginRepository to be invoked whenever URLs have to be + * parsed. This allows to check the registered protocol plugins for uncommon + * protocols. + */ + private void registerURLStreamHandlerFactory() { + org.apache.nutch.plugin.URLStreamHandlerFactory.getInstance().registerPluginRepository(this); + } + + /** + * <p>Invoked whenever a {@link java.net.URL} needs to be instantiated. Tries to find a + * suitable extension and allows it to provide a {@link java.net.URLStreamHandler}.</p> + * This is done by several attempts: + * <ul> + * <li>Find a protocol plugin that implements the desired protocol. 
If found, + * instantiate it so eventually the plugin can install a {@link java.net.URLStreamHandler} + * through a static hook.</li> + * <li>If the plugin specifies a {@link java.net.URLStreamHandler} in its + * <code>plugin.xml</code> manifest, return an instance of this + * {@link java.net.URLStreamHandler}. Example: + * + * <pre> + * ... + * <implementation id="org.apache.nutch.protocol.foo.Foo" class="org.apache.nutch.protocol.foo.Foo"> + * <parameter name="protocolName" value="foo"/> + * <parameter name="urlStreamHandler" value="org.apache.nutch.protocol.foo.Handler"/> + * </implementation> + * ... + * </pre> + * </li> + * <li>If all else fails, return null. This will fallback to the JVM's method + * of evaluating the system property <code>java.protocol.handler.pkgs</code>.</li> + * </ul> + * + * @return the URLStreamHandler found, or null. + * @see java.net.URL + * @see <a href="https://issues.apache.org/jira/browse/NUTCH-2429">NUTCH-2429</a> + */ + public URLStreamHandler createURLStreamHandler(String protocol) { + LOG.debug("Creating URLStreamHandler for protocol: {}", protocol); + + if (fExtensionPoints != null) { + ExtensionPoint ep = fExtensionPoints + .get("org.apache.nutch.protocol.Protocol"); + if (ep != null) { + Extension[] extensions = ep.getExtensions(); + for (Extension extension : extensions) { + String p = extension.getAttribute("protocolName"); + if (p.equals(protocol)) { + LOG.debug("Suitable protocolName attribute located: {}", p); + + // instantiate the plugin. This allows it to execute a static hook, + // if present. 
Extensions and PluginInstances are cached already, so we + // should not create too many instances + Object extinst = null; + try { + extinst = extension.getExtensionInstance(); + LOG.debug("Located extension instance class: {}", extinst.getClass().getName()); + } catch (Exception e) { + LOG.warn("Could not find {}", extension.getId(), e); + } + + // return the handler here, if possible + String handlerClass = extension.getAttribute("urlStreamHandler"); + LOG.debug("Located URLStreamHandler: {}", handlerClass); + if (handlerClass != null) { + // the nutch classloader + ClassLoader cl = this.getClass().getClassLoader(); + if (extinst != null) { + // the extension's classloader + cl = extinst.getClass().getClassLoader(); + } + + try { + // instantiate the handler and return it + Class<?> clazz = cl.loadClass(handlerClass); + return (URLStreamHandler) clazz.getDeclaredConstructor().newInstance(); + } catch (Exception e) { + LOG.error("Could not instantiate protocol {} handler class {} defined by extension {}", + protocol, handlerClass, extension.getId(), e); + return null; + } + } + + LOG.debug("suitable protocol extension found that did not declare a handler"); + return null; + } + } + LOG.debug("No suitable protocol extensions registered"); + } else { + LOG.debug("No protocol extensions registered?"); + } + } + + return null; + } } diff --git a/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java b/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java new file mode 100644 index 0000000..a64454c --- /dev/null +++ b/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java @@ -0,0 +1,115 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + +import java.lang.ref.WeakReference; +import java.net.URL; +import java.net.URLStreamHandler; +import java.util.ArrayList; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This URLStreamHandlerFactory knows about all the plugins + * in use and thus can create the correct URLStreamHandler + * even if it comes from a plugin classpath. + * As the JVM allows only one instance of URLStreamHandlerFactory + * to be registered, this class implements a singleton pattern. + * @author Hiran Chaudhuri + * + */ +public class URLStreamHandlerFactory + implements java.net.URLStreamHandlerFactory { + + protected static final Logger LOG = LoggerFactory + .getLogger(URLStreamHandlerFactory.class); + + /** The singleton instance. */ + private static URLStreamHandlerFactory instance; + + /** Here we register all PluginRepositories. + * In this class we do not know why several instances of PluginRepository + * are kept, nor do we know how long they will be used. To prevent + * a memory leak, this class must not keep references to PluginRepository + * but use WeakReference which allows PluginRepository to still be + * garbage collected. The prize is we need to clean the list for + * outdated references which is done in the {@link #removeInvalidRefs()} method. 
+ */ + private ArrayList<WeakReference<PluginRepository>> prs; + + static { + instance = new URLStreamHandlerFactory(); + URL.setURLStreamHandlerFactory(instance); + LOG.debug("Registered URLStreamHandlerFactory with the JVM."); + } + + private URLStreamHandlerFactory() { + prs = new ArrayList<>(); + } + + /** + * Get the singleton instance of this class. + * @return a {@link org.apache.nutch.plugin.URLStreamHandlerFactory} instance + */ + public static URLStreamHandlerFactory getInstance() { + return instance; + } + + /** Use this method once a new PluginRepository was created to register it. + * + * @param pr The PluginRepository to be registered. + */ + public void registerPluginRepository(PluginRepository pr) { + prs.add(new WeakReference<PluginRepository>(pr)); + + removeInvalidRefs(); + } + + @Override + public URLStreamHandler createURLStreamHandler(String protocol) { + LOG.debug("Creating URLStreamHandler for protocol: {}", protocol); + + removeInvalidRefs(); + + // find the 'correct' PluginRepository. For now we simply take the first. + // then ask it to return the URLStreamHandler + for(WeakReference<PluginRepository> ref: prs) { + PluginRepository pr = ref.get(); + if(pr != null) { + // found PluginRepository. Let's get the URLStreamHandler... + return pr.createURLStreamHandler(protocol); + } + } + return null; + } + + /** Maintains the list of PluginRepositories by + * removing the references whose referents have been + * garbage collected meanwhile. 
+ */ + private void removeInvalidRefs() { + LOG.debug("removeInvalidRefs()"); + ArrayList<WeakReference<PluginRepository>> copy = new ArrayList<>(prs); + for(WeakReference<PluginRepository> ref: copy) { + if(ref.get() == null) { + prs.remove(ref); + } + } + LOG.debug("Removed the following invalid references: '{}' Remaining: '{}'", copy.size()-prs.size(), prs.size()); + } +} diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java b/src/java/org/apache/nutch/util/CrawlCompletionStats.java index 8a23fbf..621484c 100644 --- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java +++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java @@ -16,40 +16,37 @@ */ package org.apache.nutch.util; -import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; import java.net.URL; import java.text.SimpleDateFormat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.MissingOptionException; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; 
import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.util.TimingUtil; -import org.apache.nutch.util.URLUtil; - -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.CommandLineParser; -import org.apache.commons.cli.GnuParser; -import org.apache.commons.cli.HelpFormatter; -import org.apache.commons.cli.Option; -import org.apache.commons.cli.OptionBuilder; -import org.apache.commons.cli.Options; -import org.apache.commons.cli.MissingOptionException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Extracts some simple crawl completion stats from the crawldb @@ -204,7 +201,14 @@ public class CrawlCompletionStats extends Configured implements Tool { public void map(Text urlText, CrawlDatum datum, Context context) throws IOException, InterruptedException { - URL url = new URL(urlText.toString()); + URL url; + try { + url = new URL(urlText.toString()); + } catch (MalformedURLException e) { + LOG.error("Failed to get host or domain from URL {}: {}", + urlText, e.getMessage()); + return; + } String out = ""; switch (mode) { case MODE_HOST: diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java index 04b38df..3e852eb 100644 --- a/src/java/org/apache/nutch/util/NutchJob.java +++ b/src/java/org/apache/nutch/util/NutchJob.java @@ -19,12 +19,13 @@ package org.apache.nutch.util; import java.io.IOException; import java.lang.invoke.MethodHandles; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.Job; +import org.apache.nutch.plugin.PluginRepository; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** A {@link Job} for Nutch jobs. 
*/ public class NutchJob extends Job { @@ -35,6 +36,11 @@ public class NutchJob extends Job { @SuppressWarnings("deprecation") public NutchJob(Configuration conf, String jobName) throws IOException { super(conf, jobName); + if (conf != null) { + // initialize plugins early to register URL stream handlers to support + // custom protocol implementations + PluginRepository.get(conf); + } } public static Job getInstance(Configuration conf) throws IOException { diff --git a/src/java/org/apache/nutch/util/NutchTool.java b/src/java/org/apache/nutch/util/NutchTool.java index 244ae99..a6a1cee 100644 --- a/src/java/org/apache/nutch/util/NutchTool.java +++ b/src/java/org/apache/nutch/util/NutchTool.java @@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.mapreduce.Job; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.plugin.PluginRepository; public abstract class NutchTool extends Configured { @@ -53,6 +54,14 @@ public abstract class NutchTool extends Configured { public NutchTool(){ super(null); } + + @Override + public void setConf(Configuration conf) { + super.setConf(conf); + if(conf != null) { + PluginRepository.get(conf); + } + } /** * Get relative progress of the tool. 
Progress is represented as a diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java index b191f23..1a1955e 100644 --- a/src/java/org/apache/nutch/util/SitemapProcessor.java +++ b/src/java/org/apache/nutch/util/SitemapProcessor.java @@ -41,7 +41,6 @@ import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; - import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.hostdb.HostDatum; import org.apache.nutch.net.URLFilters; @@ -51,7 +50,6 @@ import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolFactory; import org.apache.nutch.protocol.ProtocolOutput; import org.apache.nutch.protocol.ProtocolStatus; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -82,13 +80,11 @@ import crawlercommons.sitemaps.SiteMapURL; * the sitemaps into the CrawlDb.</li> * </ol> * - * <p> - * For more details see: - * https://cwiki.apache.org/confluence/display/NUTCH/SitemapFeature - * </p> + * @see + * <a href="https://cwiki.apache.org/confluence/display/NUTCH/SitemapFeature">SitemapFeature</a> */ public class SitemapProcessor extends Configured implements Tool { - public static final Logger LOG = LoggerFactory.getLogger(SitemapProcessor.class); + private static final Logger LOG = LoggerFactory.getLogger(SitemapProcessor.class); public static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); public static final String CURRENT_NAME = "current"; diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java index 24e7a1c..0d789ed 100644 --- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java +++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java @@ -16,31 +16,32 @@ */ package org.apache.nutch.util.domain; -import java.io.File; import 
java.io.IOException; import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; import java.net.URL; import java.text.SimpleDateFormat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Extracts some very basic statistics about domains from the crawldb @@ -175,7 +176,14 @@ public class DomainStatistics extends Configured implements Tool { || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) { try { - URL url = new URL(urlText.toString()); + URL url; + try { + url = new URL(urlText.toString()); + } catch (MalformedURLException e) { + LOG.error("Failed to get host or domain from URL {}: {}", + urlText, e.getMessage()); + return; + } String out = null; switch (mode) { case MODE_HOST: diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java index e56aaa6..c0f1d6f 100644 --- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java +++ 
b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java @@ -41,7 +41,7 @@ import org.slf4j.LoggerFactory; public class Any23IndexingFilter implements IndexingFilter { /** Logging instance */ - public static final Logger LOG = LoggerFactory.getLogger(Any23IndexingFilter.class); + private static final Logger LOG = LoggerFactory.getLogger(Any23IndexingFilter.class); public static final String STRUCTURED_DATA = "structured_data"; diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java index d9f0896..af7f135 100644 --- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java +++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java @@ -58,7 +58,7 @@ import org.w3c.dom.DocumentFragment; public class Any23ParseFilter implements HtmlParseFilter { /** Logging instance */ - public static final Logger LOG = LoggerFactory.getLogger(Any23ParseFilter.class); + private static final Logger LOG = LoggerFactory.getLogger(Any23ParseFilter.class); private Configuration conf = null; diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 95d7a16..7378096 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -71,6 +71,7 @@ <ant dir="parsefilter-naivebayes" target="deploy"/> <ant dir="parsefilter-regex" target="deploy"/> <ant dir="protocol-file" target="deploy"/> + <ant dir="protocol-foo" target="deploy" /> <ant dir="protocol-ftp" target="deploy"/> <ant dir="protocol-htmlunit" target="deploy" /> <ant dir="protocol-http" target="deploy"/> @@ -219,6 +220,7 @@ <ant dir="parsefilter-naivebayes" target="clean" /> <ant dir="parsefilter-regex" target="clean"/> <ant dir="protocol-file" target="clean"/> + <ant dir="protocol-foo" target="clean" /> <ant dir="protocol-ftp" target="clean"/> <ant dir="protocol-htmlunit" target="clean" /> <ant dir="protocol-http" target="clean"/> diff --git 
a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java index 6989feb..58e8993 100644 --- a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java +++ b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java @@ -52,7 +52,7 @@ import org.slf4j.LoggerFactory; */ public class CSVIndexWriter implements IndexWriter { - public static final Logger LOG = LoggerFactory + private static final Logger LOG = LoggerFactory .getLogger(CSVIndexWriter.class); private Configuration config; diff --git a/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java b/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java index f7a47e2..37acf12 100644 --- a/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java +++ b/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java @@ -40,7 +40,7 @@ import java.util.regex.Pattern; public class RabbitIndexWriter implements IndexWriter { - public static final Logger LOG = LoggerFactory + private static final Logger LOG = LoggerFactory .getLogger(RabbitIndexWriter.class); private String uri; diff --git a/src/plugin/protocol-foo/build.xml b/src/plugin/protocol-foo/build.xml new file mode 100755 index 0000000..240f448 --- /dev/null +++ b/src/plugin/protocol-foo/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="protocol-foo" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> diff --git a/src/plugin/protocol-foo/ivy.xml b/src/plugin/protocol-foo/ivy.xml new file mode 100755 index 0000000..1a86d68 --- /dev/null +++ b/src/plugin/protocol-foo/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> diff --git a/src/plugin/protocol-foo/plugin.xml b/src/plugin/protocol-foo/plugin.xml new file mode 100755 index 0000000..850afe3 --- /dev/null +++ b/src/plugin/protocol-foo/plugin.xml @@ -0,0 +1,48 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<plugin + id="protocol-foo" + name="Foo Protocol Example Plug-in" + version="1.0.0" + provider-name="Hiran Chaudhuri"> + + <runtime> + <library name="protocol-foo.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.protocol.foo" + name="FooProtocol" + point="org.apache.nutch.protocol.Protocol"> + + <implementation id="org.apache.nutch.protocol.foo.Foo" + class="org.apache.nutch.protocol.foo.Foo"> + <parameter name="protocolName" value="foo"/> + <parameter name="urlStreamHandler" value="org.apache.nutch.protocol.foo.Handler"/> + </implementation> + + </extension> + +</plugin> diff --git a/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Foo.java b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Foo.java new file mode 100755 index 0000000..0f56f23 --- /dev/null +++ b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Foo.java @@ -0,0 +1,141 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.foo; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.protocol.RobotRulesParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import crawlercommons.robots.BaseRobotRules; + +public class Foo implements Protocol { + protected static final Logger LOG = LoggerFactory.getLogger(Foo.class); + + private Configuration conf; + + @Override + public Configuration getConf() { + LOG.debug("getConf()"); + return this.conf; + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + } + + /** + * This is a dummy implementation only. 
So what we will do is return this + * structure: + * + * <pre> + * foo://example.com - will contain one directory and one file + * foo://example.com/a - directory, will contain two files + * foo://example.com/a/aa.txt - text file + * foo://example.com/a/ab.txt - text file + * foo://example.com/a.txt - text file + * </pre> + */ + @Override + public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { + LOG.debug("getProtocolOutput({}, {})", url, datum); + + try { + String urlstr = String.valueOf(url); + URL u = new URL(urlstr); + URL base = new URL(u, "."); + byte[] bytes = new byte[0]; + String contentType = "foo/something"; + ProtocolStatus status = ProtocolStatus.STATUS_GONE; + + switch (urlstr) { + case "foo://example.com": + case "foo://example.com/": { + String time = HttpDateFormat.toString(System.currentTimeMillis()); + contentType = "text/html"; + StringBuffer sb = new StringBuffer(); + sb.append("<html><head>"); + sb.append("<title>Index of /</title></head>\n"); + sb.append("<body><h1>Index of /</h1><pre>\n"); + sb.append("<a href='a/" + "'>a/</a>\t"+ time + "\t-\n"); // add directory + sb.append("<a href='a.txt'>a.txt</a>\t" + time + "\t" + 0 + "\n"); // add file + sb.append("</pre></html></body>"); + bytes = sb.toString().getBytes(); + status = ProtocolStatus.STATUS_SUCCESS; + break; + } + case "foo://example.com/a/": { + String time = HttpDateFormat.toString(System.currentTimeMillis()); + contentType = "text/html"; + StringBuffer sb = new StringBuffer(); + sb.append("<html><head>"); + sb.append("<title>Index of /a/</title></head>\n"); + sb.append("<body><h1>Index of /a/</h1><pre>\n"); + sb.append("<a href='aa.txt'>aa.txt</a>\t" + time + "\t" + 0 + "\n"); // add file + sb.append("<a href='ab.txt'>ab.txt</a>\t" + time + "\t" + 0 + "\n"); // add file + sb.append("</pre></html></body>"); + bytes = sb.toString().getBytes(); + status = ProtocolStatus.STATUS_SUCCESS; + break; + } + case "foo://example.com/a.txt": + case "foo://example.com/a/aa.txt": 
+ case "foo://example.com/a/ab.txt": { + contentType = "text/plain"; + bytes = "In publishing and graphic design, lorem ipsum is a filler text or greeking commonly used to demonstrate the textual elements of a graphic document or visual presentation. Replacing meaningful content with placeholder text allows designers to design the form of the content before the content itself has been produced.".getBytes(); + status = ProtocolStatus.STATUS_SUCCESS; + break; + } + default: + LOG.warn("Unknown url '{}'. This dummy implementation only supports 'foo://example.com'", url); + // all our default values are set for URLs that do not exist. + break; + } + + Metadata metadata = new Metadata(); + Content content = new Content(String.valueOf(url), String.valueOf(base), + bytes, contentType, metadata, getConf()); + + return new ProtocolOutput(content, status); + } catch (MalformedURLException mue) { + LOG.error("Could not retrieve {}", url); + LOG.error("", mue); + // claim STATUS_GONE to tell nutch to never ever re-request this URL + return new ProtocolOutput(null, ProtocolStatus.STATUS_GONE); + } + } + + @Override + public BaseRobotRules getRobotRules(Text url, CrawlDatum datum, + List<Content> robotsTxtContent) { + LOG.debug("getRobotRules({}, {}, {})", url, datum, robotsTxtContent); + return RobotRulesParser.EMPTY_RULES; + } +} diff --git a/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Handler.java b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Handler.java new file mode 100644 index 0000000..27f1837 --- /dev/null +++ b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Handler.java @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
/**
 * {@link URLStreamHandler} for the <code>foo</code> protocol of the example
 * plugin. Its only purpose is to let {@link URL} accept URLs of that protocol
 * when an instance is supplied as the handler; opening a connection is not
 * supported.
 */
public class Handler extends URLStreamHandler {

  /**
   * Not supported by this handler.
   *
   * @param u
   *          the URL a connection is requested for
   * @return never returns normally
   * @throws UnsupportedOperationException
   *           always
   */
  @Override
  protected URLConnection openConnection(URL u) {
    throw new UnsupportedOperationException("not yet implemented");
  }
}