http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/ExtensionPoint.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/ExtensionPoint.java b/nutch-core/src/main/java/org/apache/nutch/plugin/ExtensionPoint.java new file mode 100644 index 0000000..178c5a2 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/plugin/ExtensionPoint.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + +import java.util.ArrayList; + +/** + * The <code>ExtensionPoint</code> provide meta information of a extension + * point. 
+ * + * @author joa23 + */ +public class ExtensionPoint { + private String ftId; + private String fName; + private String fSchema; + private ArrayList<Extension> fExtensions; + + /** + * Constructor + * + * @param pId + * unique extension point Id + * @param pName + * name of the extension point + * @param pSchema + * xml schema of the extension point + */ + public ExtensionPoint(String pId, String pName, String pSchema) { + setId(pId); + setName(pName); + setSchema(pSchema); + fExtensions = new ArrayList<Extension>(); + } + + /** + * Returns the unique id of the extension point. + * + * @return String + */ + public String getId() { + return ftId; + } + + /** + * Returns the name of the extension point. + * + * @return String + */ + public String getName() { + return fName; + } + + /** + * Returns a path to the xml schema of a extension point. + * + * @return String + */ + public String getSchema() { + return fSchema; + } + + /** + * Sets the extensionPointId. + * + * @param pId + * extension point id + */ + private void setId(String pId) { + ftId = pId; + } + + /** + * Sets the extension point name. + * + * @param pName + */ + private void setName(String pName) { + fName = pName; + } + + /** + * Sets the schema. + * + * @param pSchema + */ + private void setSchema(String pSchema) { + fSchema = pSchema; + } + + /** + * Install a coresponding extension to this extension point. + * + * @param extension + */ + public void addExtension(Extension extension) { + fExtensions.add(extension); + } + + /** + * Returns a array of extensions that lsiten to this extension point + * + * @return Extension[] + */ + public Extension[] getExtensions() { + return fExtensions.toArray(new Extension[fExtensions.size()]); + } + +}
// nutch-core/src/main/java/org/apache/nutch/plugin/MissingDependencyException.java
// (package org.apache.nutch.plugin)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * <code>MissingDependencyException</code> is thrown when a plugin dependency
 * cannot be found.
 *
 * @author Jérôme Charron
 */
public class MissingDependencyException extends Exception {

  private static final long serialVersionUID = 1L;

  /**
   * Creates an exception that wraps the underlying failure.
   *
   * @param cause
   *          the failure that made the dependency unavailable
   */
  public MissingDependencyException(Throwable cause) {
    super(cause);
  }

  /**
   * Creates an exception with a description of the missing dependency.
   *
   * @param message
   *          description of the dependency that could not be found
   */
  public MissingDependencyException(String message) {
    super(message);
  }

  /**
   * Creates an exception that carries both a description and the underlying
   * failure, so neither has to be dropped when rethrowing.
   *
   * @param message
   *          description of the dependency that could not be found
   * @param cause
   *          the failure that made the dependency unavailable
   */
  public MissingDependencyException(String message, Throwable cause) {
    super(message, cause);
  }
}
// nutch-core/src/main/java/org/apache/nutch/plugin/Pluggable.java
// (package org.apache.nutch.plugin)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Marker interface for types that can be plugged into Nutch. It declares no
 * members; it is the common interface that every Nutch extension point must
 * implement.
 *
 * @author Jérôme Charron
 *
 * @see <a href="http://wiki.apache.org/nutch/AboutPlugins">About Plugins</a>
 * @see <a href="package-summary.html#package_description"> plugin package
 *      description</a>
 */
public interface Pluggable {
}
Extension points are a kind of publisher that + * provide a API and invoke one or a set of installed extensions. + * + * Each plugin may extend the base <code>Plugin</code>. <code>Plugin</code> + * instances are used as the point of life cycle managemet of plugin related + * functionality. + * + * The <code>Plugin</code> will be startuped and shutdown by the nutch plugin + * management system. + * + * A possible usecase of the <code>Plugin</code> implementation is to create or + * close a database connection. + * + * @author joa23 + */ +public class Plugin { + private PluginDescriptor fDescriptor; + protected Configuration conf; + + /** + * Constructor + * + */ + public Plugin(PluginDescriptor pDescriptor, Configuration conf) { + setDescriptor(pDescriptor); + this.conf = conf; + } + + /** + * Will be invoked until plugin start up. Since the nutch-plugin system use + * lazy loading the start up is invoked until the first time a extension is + * used. + * + * @throws PluginRuntimeException + * If the startup was without successs. + */ + public void startUp() throws PluginRuntimeException { + } + + /** + * Shutdown the plugin. This happens until nutch will be stopped. + * + * @throws PluginRuntimeException + * if a problems occurs until shutdown the plugin. 
+ */ + public void shutDown() throws PluginRuntimeException { + } + + /** + * Returns the plugin descriptor + * + * @return PluginDescriptor + */ + public PluginDescriptor getDescriptor() { + return fDescriptor; + } + + /** + * @param descriptor + * The descriptor to set + */ + private void setDescriptor(PluginDescriptor descriptor) { + fDescriptor = descriptor; + } + + protected void finalize() throws Throwable { + super.finalize(); + shutDown(); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/PluginClassLoader.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/PluginClassLoader.java b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginClassLoader.java new file mode 100644 index 0000000..128bbc6 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginClassLoader.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
// nutch-core/src/main/java/org/apache/nutch/plugin/PluginClassLoader.java
// (package org.apache.nutch.plugin)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.net.URL;
import java.net.URLClassLoader;
import java.util.Arrays;

/**
 * The <code>PluginClassLoader</code> contains only classes of the runtime
 * libraries set up in the plugin manifest file and exported libraries of
 * plugins that are required by this plugin. Libraries can be exported or not.
 * Non-exported libraries are only used in the plugin's own
 * <code>PluginClassLoader</code>. Exported libraries are available to the
 * <code>PluginClassLoader</code> of plugins that depend on these plugins.
 *
 * Two loaders are equal when they have the same parent and the same sequence
 * of library URLs.
 *
 * @author joa23
 */
public class PluginClassLoader extends URLClassLoader {

  /**
   * External forms of the library URLs, captured at construction time. Using
   * strings keeps {@link #equals(Object)} and {@link #hashCode()} stable when
   * the caller later modifies its array, and avoids going through
   * <code>URL.equals</code>/<code>URL.hashCode</code>, which may resolve host
   * names.
   */
  private final String[] urlKeys;

  /** Parent loader as passed to the constructor; may be <code>null</code>. */
  private final ClassLoader parent;

  /**
   * Constructor
   *
   * @param urls
   *          Array of urls with own libraries and all exported libraries of
   *          plugins that are required by this plugin
   * @param parent
   *          the parent class loader for delegation
   */
  public PluginClassLoader(URL[] urls, ClassLoader parent) {
    super(urls, parent);

    String[] keys = new String[urls.length];
    for (int i = 0; i < urls.length; i++) {
      keys[i] = (urls[i] == null) ? null : urls[i].toExternalForm();
    }
    this.urlKeys = keys;
    this.parent = parent;
  }

  @Override
  public int hashCode() {
    final int PRIME = 31;
    int result = 1;
    result = PRIME * result + ((parent == null) ? 0 : parent.hashCode());
    result = PRIME * result + Arrays.hashCode(urlKeys);
    return result;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
      return false;
    }
    final PluginClassLoader other = (PluginClassLoader) obj;
    if (parent == null) {
      if (other.parent != null) {
        return false;
      }
    } else if (!parent.equals(other.parent)) {
      return false;
    }
    return Arrays.equals(urlKeys, other.urlKeys);
  }
}
+ */ +package org.apache.nutch.plugin; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URI; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Locale; +import java.util.MissingResourceException; +import java.util.ResourceBundle; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; + +/** + * The <code>PluginDescriptor</code> provide access to all meta information of a + * nutch-plugin, as well to the internationalizable resources and the plugin own + * classloader. There are meta information about <code>Plugin</code>, + * <code>ExtensionPoint</code> and <code>Extension</code>. To provide access to + * the meta data of a plugin via a descriptor allow a lazy loading mechanism. + */ +public class PluginDescriptor { + private String fPluginPath; + private String fPluginClass = Plugin.class.getName(); + private String fPluginId; + private String fVersion; + private String fName; + private String fProviderName; + private HashMap<String, ResourceBundle> fMessages = new HashMap<String, ResourceBundle>(); + private ArrayList<ExtensionPoint> fExtensionPoints = new ArrayList<ExtensionPoint>(); + private ArrayList<String> fDependencies = new ArrayList<String>(); + private ArrayList<URL> fExportedLibs = new ArrayList<URL>(); + private ArrayList<URL> fNotExportedLibs = new ArrayList<URL>(); + private ArrayList<Extension> fExtensions = new ArrayList<Extension>(); + private PluginClassLoader fClassLoader; + public static final Logger LOG = LoggerFactory + .getLogger(PluginDescriptor.class); + private Configuration fConf; + + /** + * Constructor + * + * @param pId + * @param pVersion + * @param pName + * @param pProviderName + * @param pPluginclazz + * @param pPath + */ + public PluginDescriptor(String pId, String pVersion, String pName, + String pProviderName, String pPluginclazz, String pPath, + Configuration conf) { + 
setPath(pPath); + setPluginId(pId); + setVersion(pVersion); + setName(pName); + setProvidername(pProviderName); + + if (pPluginclazz != null) + setPluginClass(pPluginclazz); + + this.fConf = conf; + } + + /** + * @param pPath + */ + private void setPath(String pPath) { + fPluginPath = pPath; + } + + /** + * Returns the name of the plugin. + * + * @return String + */ + public String getName() { + return fName; + } + + /** + * @param providerName + */ + private void setProvidername(String providerName) { + fProviderName = providerName; + } + + /** + * @param name + */ + private void setName(String name) { + fName = name; + } + + /** + * @param version + */ + private void setVersion(String version) { + fVersion = version; + } + + /** + * Returns the fully qualified name of the class which implements the abstarct + * <code>Plugin</code> class. + * + * @return the name of this plug-in's runtime class or <code>null</code>. + */ + public String getPluginClass() { + return fPluginClass; + } + + /** + * Returns the unique identifier of the plug-in or <code>null</code>. + * + * @return String + */ + public String getPluginId() { + return fPluginId; + } + + /** + * Returns an array of extensions. + * + * @return Exception[] + */ + public Extension[] getExtensions() { + return fExtensions.toArray(new Extension[fExtensions.size()]); + } + + /** + * Adds a extension. + * + * @param pExtension + */ + public void addExtension(Extension pExtension) { + fExtensions.add(pExtension); + } + + /** + * Sets the pluginClass. + * + * @param pluginClass + * The pluginClass to set + */ + private void setPluginClass(String pluginClass) { + fPluginClass = pluginClass; + } + + /** + * Sets the plugin Id. + * + * @param pluginId + * The pluginId to set + */ + private void setPluginId(String pluginId) { + fPluginId = pluginId; + } + + /** + * Adds a extension point. 
+ * + * @param extensionPoint + */ + public void addExtensionPoint(ExtensionPoint extensionPoint) { + fExtensionPoints.add(extensionPoint); + } + + /** + * Returns a array of extension points. + * + * @return ExtensionPoint[] + */ + public ExtensionPoint[] getExtenstionPoints() { + return fExtensionPoints + .toArray(new ExtensionPoint[fExtensionPoints.size()]); + } + + /** + * Returns a array of plugin ids. + * + * @return String[] + */ + public String[] getDependencies() { + return fDependencies.toArray(new String[fDependencies.size()]); + } + + /** + * Adds a dependency + * + * @param pId + * id of the dependent plugin + */ + public void addDependency(String pId) { + fDependencies.add(pId); + } + + /** + * Adds a exported library with a relative path to the plugin directory. We + * automatically escape characters that are illegal in URLs. It is recommended + * that code converts an abstract pathname into a URL by first converting it + * into a URI, via the toURI method, and then converting the URI into a URL + * via the URI.toURL method. + * + * @param pLibPath + */ + public void addExportedLibRelative(String pLibPath) + throws MalformedURLException { + URI uri = new File(getPluginPath() + File.separator + pLibPath).toURI(); + URL url = uri.toURL(); + fExportedLibs.add(url); + } + + /** + * Returns the directory path of the plugin. + * + * @return String + */ + public String getPluginPath() { + return fPluginPath; + } + + /** + * Returns a array exported librareis as URLs + * + * @return URL[] + */ + public URL[] getExportedLibUrls() { + return fExportedLibs.toArray(new URL[0]); + } + + /** + * Adds a exported library with a relative path to the plugin directory. We + * automatically escape characters that are illegal in URLs. It is recommended + * that code converts an abstract pathname into a URL by first converting it + * into a URI, via the toURI method, and then converting the URI into a URL + * via the URI.toURL method. 
+ * + * @param pLibPath + */ + public void addNotExportedLibRelative(String pLibPath) + throws MalformedURLException { + URI uri = new File(getPluginPath() + File.separator + pLibPath).toURI(); + URL url = uri.toURL(); + fNotExportedLibs.add(url); + } + + /** + * Returns a array of libraries as URLs that are not exported by the plugin. + * + * @return URL[] + */ + public URL[] getNotExportedLibUrls() { + return fNotExportedLibs.toArray(new URL[fNotExportedLibs.size()]); + } + + /** + * Returns a cached classloader for a plugin. Until classloader creation all + * needed libraries are collected. A classloader use as first the plugins own + * libraries and add then all exported libraries of dependend plugins. + * + * @return PluginClassLoader the classloader for the plugin + */ + public PluginClassLoader getClassLoader() { + if (fClassLoader != null) + return fClassLoader; + ArrayList<URL> arrayList = new ArrayList<URL>(); + arrayList.addAll(fExportedLibs); + arrayList.addAll(fNotExportedLibs); + arrayList.addAll(getDependencyLibs()); + File file = new File(getPluginPath()); + try { + for (File file2 : file.listFiles()) { + if (file2.getAbsolutePath().endsWith("properties")) + arrayList.add(file2.getParentFile().toURI().toURL()); + } + } catch (MalformedURLException e) { + LOG.debug(getPluginId() + " " + e.toString()); + } + URL[] urls = arrayList.toArray(new URL[arrayList.size()]); + fClassLoader = new PluginClassLoader(urls, + PluginDescriptor.class.getClassLoader()); + return fClassLoader; + } + + /** + * @return Collection + */ + private ArrayList<URL> getDependencyLibs() { + ArrayList<URL> list = new ArrayList<URL>(); + collectLibs(list, this); + return list; + } + + /** + * @param pLibs + * @param pDescriptor + */ + private void collectLibs(ArrayList<URL> pLibs, PluginDescriptor pDescriptor) { + + for (String id : pDescriptor.getDependencies()) { + PluginDescriptor descriptor = PluginRepository.get(fConf) + .getPluginDescriptor(id); + for (URL url : 
descriptor.getExportedLibUrls()) { + pLibs.add(url); + } + collectLibs(pLibs, descriptor); + } + } + + /** + * Returns a I18N'd resource string. The resource bundles could be stored in + * root directory of a plugin in the well know i18n file name conventions. + * + * @param pKey + * @param pLocale + * @return String + * @throws IOException + */ + public String getResourceString(String pKey, Locale pLocale) + throws IOException { + if (fMessages.containsKey(pLocale.toString())) { + ResourceBundle bundle = fMessages.get(pLocale.toString()); + try { + return bundle.getString(pKey); + } catch (MissingResourceException e) { + return '!' + pKey + '!'; + } + } + try { + ResourceBundle res = ResourceBundle.getBundle("messages", pLocale, + getClassLoader()); + return res.getString(pKey); + } catch (MissingResourceException x) { + return '!' + pKey + '!'; + } + } + + public String getProviderName() { + return fProviderName; + } + + public String getVersion() { + return fVersion; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/PluginManifestParser.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/PluginManifestParser.java b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginManifestParser.java new file mode 100644 index 0000000..bd2a490 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginManifestParser.java @@ -0,0 +1,303 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + +import java.io.File; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLDecoder; +import java.util.HashMap; +import java.util.Map; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.slf4j.Logger; + +import org.apache.hadoop.conf.Configuration; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +/** + * The <code>PluginManifestParser</code> parser just parse the manifest file in + * all plugin directories. + * + * @author joa23 + */ +public class PluginManifestParser { + private static final String ATTR_NAME = "name"; + private static final String ATTR_CLASS = "class"; + private static final String ATTR_ID = "id"; + + public static final Logger LOG = PluginRepository.LOG; + + private static final boolean WINDOWS = System.getProperty("os.name") + .startsWith("Windows"); + + private Configuration conf; + + private PluginRepository pluginRepository; + + public PluginManifestParser(Configuration conf, + PluginRepository pluginRepository) { + this.conf = conf; + this.pluginRepository = pluginRepository; + } + + /** + * Returns a list of all found plugin descriptors. + * + * @param pluginFolders + * folders to search plugins from + * @return A {@link Map} of all found {@link PluginDescriptor}s. 
+ */ + public Map<String, PluginDescriptor> parsePluginFolder(String[] pluginFolders) { + Map<String, PluginDescriptor> map = new HashMap<String, PluginDescriptor>(); + + if (pluginFolders == null) { + throw new IllegalArgumentException("plugin.folders is not defined"); + } + + for (String name : pluginFolders) { + File directory = getPluginFolder(name); + if (directory == null) { + continue; + } + LOG.info("Plugins: looking in: " + directory.getAbsolutePath()); + for (File oneSubFolder : directory.listFiles()) { + if (oneSubFolder.isDirectory()) { + String manifestPath = oneSubFolder.getAbsolutePath() + File.separator + + "plugin.xml"; + try { + LOG.debug("parsing: " + manifestPath); + PluginDescriptor p = parseManifestFile(manifestPath); + map.put(p.getPluginId(), p); + } catch (Exception e) { + LOG.warn("Error while loading plugin `" + manifestPath + "` " + + e.toString()); + } + } + } + } + return map; + } + + /** + * Return the named plugin folder. If the name is absolute then it is + * returned. Otherwise, for relative names, the classpath is scanned. + */ + public File getPluginFolder(String name) { + File directory = new File(name); + if (!directory.isAbsolute()) { + URL url = PluginManifestParser.class.getClassLoader().getResource(name); + if (url == null && directory.exists() && directory.isDirectory() + && directory.listFiles().length > 0) { + return directory; // relative path that is not in the classpath + } else if (url == null) { + LOG.warn("Plugins: directory not found: " + name); + return null; + } else if (!"file".equals(url.getProtocol())) { + LOG.warn("Plugins: not a file: url. 
Can't load plugins from: " + url); + return null; + } + String path = url.getPath(); + if (WINDOWS && path.startsWith("/")) // patch a windows bug + path = path.substring(1); + try { + path = URLDecoder.decode(path, "UTF-8"); // decode the url path + } catch (UnsupportedEncodingException e) { + } + directory = new File(path); + } else if (!directory.exists()) { + LOG.warn("Plugins: directory not found: " + name); + return null; + } + return directory; + } + + /** + * @param manifestPath + * @throws ParserConfigurationException + * @throws IOException + * @throws SAXException + * @throws MalformedURLException + */ + private PluginDescriptor parseManifestFile(String pManifestPath) + throws MalformedURLException, SAXException, IOException, + ParserConfigurationException { + Document document = parseXML(new File(pManifestPath).toURI().toURL()); + String pPath = new File(pManifestPath).getParent(); + return parsePlugin(document, pPath); + } + + /** + * @param url + * @return Document + * @throws IOException + * @throws SAXException + * @throws ParserConfigurationException + * @throws DocumentException + */ + private Document parseXML(URL url) throws SAXException, IOException, + ParserConfigurationException { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilder builder = factory.newDocumentBuilder(); + return builder.parse(url.openStream()); + } + + /** + * @param pDocument + * @throws MalformedURLException + */ + private PluginDescriptor parsePlugin(Document pDocument, String pPath) + throws MalformedURLException { + Element rootElement = pDocument.getDocumentElement(); + String id = rootElement.getAttribute(ATTR_ID); + String name = rootElement.getAttribute(ATTR_NAME); + String version = rootElement.getAttribute("version"); + String providerName = rootElement.getAttribute("provider-name"); + String pluginClazz = null; + if (rootElement.getAttribute(ATTR_CLASS).trim().length() > 0) { + pluginClazz = 
rootElement.getAttribute(ATTR_CLASS); + } + PluginDescriptor pluginDescriptor = new PluginDescriptor(id, version, name, + providerName, pluginClazz, pPath, this.conf); + LOG.debug("plugin: id=" + id + " name=" + name + " version=" + version + + " provider=" + providerName + "class=" + pluginClazz); + parseExtension(rootElement, pluginDescriptor); + parseExtensionPoints(rootElement, pluginDescriptor); + parseLibraries(rootElement, pluginDescriptor); + parseRequires(rootElement, pluginDescriptor); + return pluginDescriptor; + } +

  /**
   * Registers the plugin ids named by the <code>import</code> elements of the
   * first <code>requires</code> element as dependencies of the descriptor.
   * Nothing is registered when the manifest has no <code>requires</code>
   * element.
   *
   * @param pRootElement
   *          root element of the plugin manifest
   * @param pDescriptor
   *          descriptor that receives the dependencies
   * @throws MalformedURLException
   *           declared for symmetry with the other parse methods
   */
  private void parseRequires(Element pRootElement, PluginDescriptor pDescriptor)
      throws MalformedURLException {

    NodeList nodelist = pRootElement.getElementsByTagName("requires");
    if (nodelist.getLength() == 0) {
      return;
    }

    Element requires = (Element) nodelist.item(0);
    NodeList imports = requires.getElementsByTagName("import");
    for (int i = 0; i < imports.getLength(); i++) {
      Element anImport = (Element) imports.item(i);
      String plugin = anImport.getAttribute("plugin");
      // DOM returns "" (not null) for an absent attribute, so a null check
      // alone would register an empty dependency id.
      if (plugin == null || plugin.isEmpty()) {
        LOG.warn("Ignoring <import> without plugin attribute in plugin "
            + pDescriptor.getPluginId());
        continue;
      }
      pDescriptor.addDependency(plugin);
    }
  }

  /**
   * Registers every <code>library</code> of the first <code>runtime</code>
   * element with the descriptor. A library that contains at least one
   * <code>export</code> element is added as exported, any other library as
   * not exported.
   *
   * @param pRootElement
   *          root element of the plugin manifest
   * @param pDescriptor
   *          descriptor that receives the libraries
   * @throws MalformedURLException
   *           propagated from the descriptor when a library is added
   */
  private void parseLibraries(Element pRootElement, PluginDescriptor pDescriptor)
      throws MalformedURLException {
    NodeList nodelist = pRootElement.getElementsByTagName("runtime");
    if (nodelist.getLength() == 0) {
      return;
    }

    Element runtime = (Element) nodelist.item(0);
    NodeList libraries = runtime.getElementsByTagName("library");
    for (int i = 0; i < libraries.getLength(); i++) {
      Element library = (Element) libraries.item(i);
      String libName = library.getAttribute(ATTR_NAME);
      boolean exported = library.getElementsByTagName("export").getLength() > 0;
      if (exported) {
        pDescriptor.addExportedLibRelative(libName);
      } else {
        pDescriptor.addNotExportedLibRelative(libName);
      }
    }
  }

  /**
   * Creates an {@link ExtensionPoint} (id, name, schema) for every
   * <code>extension-point</code> element below the root and adds it to the
   * descriptor.
   *
   * @param pRootElement
   *          root element of the plugin manifest
   * @param pPluginDescriptor
   *          descriptor that receives the extension points
   */
  private void parseExtensionPoints(Element pRootElement,
      PluginDescriptor pPluginDescriptor) {
    NodeList list = pRootElement.getElementsByTagName("extension-point");
    if (list == null) {
      return;
    }
    for (int i = 0; i < list.getLength(); i++) {
      Element oneExtensionPoint = (Element) list.item(i);
      String id = oneExtensionPoint.getAttribute(ATTR_ID);
      String name = oneExtensionPoint.getAttribute(ATTR_NAME);
      String schema = oneExtensionPoint.getAttribute("schema");
      pPluginDescriptor.addExtensionPoint(new ExtensionPoint(id, name, schema));
    }
  }

  /**
   * Creates one {@link Extension} per <code>implementation</code> child of
   * every <code>extension</code> element, copies the name/value pairs of its
   * <code>parameter</code> elements into the extension's attributes and adds
   * the extension to the descriptor.
   *
   * @param pRootElement
   *          root element of the plugin manifest
   * @param pPluginDescriptor
   *          descriptor that receives the extensions
   */
  private void parseExtension(Element pRootElement,
      PluginDescriptor pPluginDescriptor) {
    NodeList extensions = pRootElement.getElementsByTagName("extension");
    if (extensions == null) {
      return;
    }
    for (int i = 0; i < extensions.getLength(); i++) {
      Element oneExtension = (Element) extensions.item(i);
      String pointId = oneExtension.getAttribute("point");

      NodeList extensionImplementations = oneExtension.getChildNodes();
      if (extensionImplementations == null) {
        continue;
      }
      for (int j = 0; j < extensionImplementations.getLength(); j++) {
        Node node = extensionImplementations.item(j);
        // only direct <implementation> children are of interest; text and
        // other nodes are skipped before the cast to Element
        if (!node.getNodeName().equals("implementation")) {
          continue;
        }
        Element oneImplementation = (Element) node;
        String id = oneImplementation.getAttribute(ATTR_ID);
        String extensionClass = oneImplementation.getAttribute(ATTR_CLASS);
        LOG.debug("impl: point=" + pointId + " class=" + extensionClass);
        Extension extension = new Extension(pPluginDescriptor, pointId, id,
            extensionClass, this.conf, this.pluginRepository);
        NodeList parameters = oneImplementation
            .getElementsByTagName("parameter");
        if (parameters != null) {
          for (int k = 0; k < parameters.getLength(); k++) {
            Element param = (Element) parameters.item(k);
            extension.addAttribute(param.getAttribute(ATTR_NAME),
                param.getAttribute("value"));
          }
        }
        pPluginDescriptor.addExtension(extension);
      }
    }
  }
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRepository.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRepository.java b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRepository.java new file mode 100644 index 0000000..3e19345 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRepository.java @@ -0,0 +1,523 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.plugin; + +import java.lang.reflect.Array; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.WeakHashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.ObjectCache; + +/** + * The plugin repository is a registry of all plugins. + * + * At system boot up a repository is built by parsing the manifest files of + * all plugins. Plugins that require other plugins which do not exist are not + * registered. For each plugin a plugin descriptor instance will be created. The + * descriptor represents all meta information about a plugin. So a plugin + * instance will be created later when it is required, this allows lazy plugin + * loading. 
+ */
public class PluginRepository {
  /**
   * Repository instances handed out by {@link #get(Configuration)}, keyed by
   * the configuration UUID. Keys are held weakly by the map.
   */
  private static final WeakHashMap<String, PluginRepository> CACHE = new WeakHashMap<String, PluginRepository>();

  /** Value of the property <code>plugin.auto-activation</code> (default true). */
  private boolean auto;

  /** Plugins that passed include/exclude filtering and the dependency check. */
  private List<PluginDescriptor> fRegisteredPlugins;

  /** Extension points of the registered plugins, keyed by extension point id. */
  private HashMap<String, ExtensionPoint> fExtensionPoints;

  /**
   * Started plugin instances keyed by plugin id. Every access after
   * construction happens while holding this map's monitor.
   */
  private HashMap<String, Plugin> fActivatedPlugins;

  /**
   * Loaded classes, keyed by class name and then by the plugin class loader
   * that loaded them. Shared by all repositories; every access happens while
   * holding this map's monitor.
   */
  private static final Map<String, Map<PluginClassLoader, Class>> CLASS_CACHE = new HashMap<String, Map<PluginClassLoader, Class>>();

  private Configuration conf;

  public static final Logger LOG = LoggerFactory
      .getLogger(PluginRepository.class);

  /**
   * Builds the repository: parses the manifests found below
   * <code>plugin.folders</code>, filters them by <code>plugin.includes</code>
   * and <code>plugin.excludes</code>, resolves dependencies and installs
   * extension points and extensions.
   *
   * @param conf
   *          configuration to read the plugin properties from; a copy of it is
   *          kept by the repository
   * @throws RuntimeException
   *           if an extension targets an extension point that does not exist;
   *           the causing {@link PluginRuntimeException} is attached as cause
   * @see java.lang.Object#Object()
   */
  public PluginRepository(Configuration conf) throws RuntimeException {
    fActivatedPlugins = new HashMap<String, Plugin>();
    fExtensionPoints = new HashMap<String, ExtensionPoint>();
    this.conf = new Configuration(conf);
    this.auto = conf.getBoolean("plugin.auto-activation", true);
    String[] pluginFolders = conf.getStrings("plugin.folders");
    PluginManifestParser manifestParser = new PluginManifestParser(this.conf,
        this);
    Map<String, PluginDescriptor> allPlugins = manifestParser
        .parsePluginFolder(pluginFolders);
    if (allPlugins.isEmpty()) {
      LOG.warn("No plugins found on paths of property plugin.folders=\"{}\"",
          conf.get("plugin.folders"));
    }
    Pattern excludes = Pattern.compile(conf.get("plugin.excludes", ""));
    Pattern includes = Pattern.compile(conf.get("plugin.includes", ""));
    Map<String, PluginDescriptor> filteredPlugins = filter(excludes, includes,
        allPlugins);
    // with auto-activation, dependencies may be satisfied by plugins that
    // were filtered out; otherwise only by the filtered ones
    fRegisteredPlugins = getDependencyCheckedPlugins(filteredPlugins,
        this.auto ? allPlugins : filteredPlugins);
    installExtensionPoints(fRegisteredPlugins);
    try {
      installExtensions(fRegisteredPlugins);
    } catch (PluginRuntimeException e) {
      LOG.error(e.toString(), e);
      // keep the original exception as cause so the stack trace is not lost
      throw new RuntimeException(e.getMessage(), e);
    }
    displayStatus();
  }

  /**
   * Returns the repository cached for the UUID of the given configuration,
   * creating and caching it on first use.
   *
   * @param conf
   *          configuration identifying the repository
   * @return a cached instance of the plugin repository
   */
  public static synchronized PluginRepository get(Configuration conf) {
    String uuid = NutchConfiguration.getUUID(conf);
    if (uuid == null) {
      // NOTE(review): this fallback key is a freshly built String that only
      // the weak map references once this method returns, so the entry may
      // be dropped at any garbage collection - confirm whether that is
      // intended.
      uuid = "nonNutchConf@" + conf.hashCode(); // fallback
    }
    PluginRepository result = CACHE.get(uuid);
    if (result == null) {
      result = new PluginRepository(conf);
      CACHE.put(uuid, result);
    }
    return result;
  }

  /**
   * Indexes the extension points of the given plugins by their id. Does
   * nothing when <code>plugins</code> is null.
   */
  private void installExtensionPoints(List<PluginDescriptor> plugins) {
    if (plugins == null) {
      return;
    }

    for (PluginDescriptor plugin : plugins) {
      for (ExtensionPoint point : plugin.getExtenstionPoints()) {
        String xpId = point.getId();
        LOG.debug("Adding extension point " + xpId);
        fExtensionPoints.put(xpId, point);
      }
    }
  }

  /**
   * Attaches every extension of the given plugins to its target extension
   * point.
   *
   * @param pRegisteredPlugins
   *          plugins whose extensions are installed
   * @throws PluginRuntimeException
   *           if an extension names an extension point that is not installed
   */
  private void installExtensions(List<PluginDescriptor> pRegisteredPlugins)
      throws PluginRuntimeException {

    for (PluginDescriptor descriptor : pRegisteredPlugins) {
      for (Extension extension : descriptor.getExtensions()) {
        String xpId = extension.getTargetPoint();
        ExtensionPoint point = getExtensionPoint(xpId);
        if (point == null) {
          throw new PluginRuntimeException("Plugin ("
              + descriptor.getPluginId() + "), " + "extension point: " + xpId
              + " does not exist.");
        }
        point.addExtension(extension);
      }
    }
  }

  /**
   * Depth-first walk over the dependencies of <code>plugin</code>. Every
   * dependency found is added to <code>dependencies</code>;
   * <code>branch</code> holds the plugins on the current path and is used to
   * detect cycles.
   *
   * @throws MissingDependencyException
   *           if a dependency id is not contained in <code>plugins</code>
   * @throws CircularDependencyException
   *           if a dependency is already on the current path
   */
  private void getPluginCheckedDependencies(PluginDescriptor plugin,
      Map<String, PluginDescriptor> plugins,
      Map<String, PluginDescriptor> dependencies,
      Map<String, PluginDescriptor> branch) throws MissingDependencyException,
      CircularDependencyException {

    if (dependencies == null) {
      dependencies = new HashMap<String, PluginDescriptor>();
    }
    if (branch == null) {
      branch = new HashMap<String, PluginDescriptor>();
    }
    branch.put(plugin.getPluginId(), plugin);

    // Otherwise, checks each dependency
    for (String id : plugin.getDependencies()) {
      PluginDescriptor dependency = plugins.get(id);
      if (dependency == null) {
        throw new MissingDependencyException("Missing dependency " + id
            + " for plugin " + plugin.getPluginId());
      }
      if (branch.containsKey(id)) {
        throw new CircularDependencyException("Circular dependency detected "
            + id + " for plugin " + plugin.getPluginId());
      }
      dependencies.put(id, dependency);
      getPluginCheckedDependencies(plugins.get(id), plugins, dependencies,
          branch);
    }

    // leaving this plugin: it is no longer part of the current path
    branch.remove(plugin.getPluginId());
  }

  /**
   * Returns the transitive dependencies of <code>plugin</code>, resolved
   * against <code>plugins</code>.
   */
  private Map<String, PluginDescriptor> getPluginCheckedDependencies(
      PluginDescriptor plugin, Map<String, PluginDescriptor> plugins)
      throws MissingDependencyException, CircularDependencyException {
    Map<String, PluginDescriptor> dependencies = new HashMap<String, PluginDescriptor>();
    Map<String, PluginDescriptor> branch = new HashMap<String, PluginDescriptor>();
    getPluginCheckedDependencies(plugin, plugins, dependencies, branch);
    return dependencies;
  }

  /**
   * Returns the filtered plugins whose dependencies can be resolved, together
   * with those dependencies. Plugins with a missing or circular dependency
   * are logged and left out.
   *
   * @param filtered
   *          the plugins that passed filtering
   * @param all
   *          the plugins dependencies may be resolved against
   * @return list of usable plugins, or null if <code>filtered</code> is null
   */
  private List<PluginDescriptor> getDependencyCheckedPlugins(
      Map<String, PluginDescriptor> filtered, Map<String, PluginDescriptor> all) {
    if (filtered == null) {
      return null;
    }
    Map<String, PluginDescriptor> checked = new HashMap<String, PluginDescriptor>();

    for (PluginDescriptor plugin : filtered.values()) {
      try {
        checked.putAll(getPluginCheckedDependencies(plugin, all));
        checked.put(plugin.getPluginId(), plugin);
      } catch (MissingDependencyException mde) {
        // Logger exception and ignore plugin
        LOG.warn(mde.getMessage());
      } catch (CircularDependencyException cde) {
        // Simply ignore this plugin
        LOG.warn(cde.getMessage());
      }
    }
    return new ArrayList<PluginDescriptor>(checked.values());
  }

  /**
   * Returns all registered plugin descriptors.
   *
   * @return PluginDescriptor[]
   */
  public PluginDescriptor[] getPluginDescriptors() {
    return fRegisteredPlugins.toArray(new PluginDescriptor[fRegisteredPlugins
        .size()]);
  }

  /**
   * Returns the descriptor of one plugin identified by a plugin id.
   *
   * @param pPluginId
   *          id of the plugin to look up
   * @return the matching descriptor, or null if no registered plugin has
   *         that id
   */
  public PluginDescriptor getPluginDescriptor(String pPluginId) {

    for (PluginDescriptor descriptor : fRegisteredPlugins) {
      if (descriptor.getPluginId().equals(pPluginId))
        return descriptor;
    }
    return null;
  }

  /**
   * Returns an extension point identified by an extension point id.
   *
   * @param pXpId
   *          extension point id
   * @return the extension point, or null if none is installed under that id
   */
  public ExtensionPoint getExtensionPoint(String pXpId) {
    return this.fExtensionPoints.get(pXpId);
  }

  /**
   * Returns an instance of a plugin. Plugin instances are cached, so a plugin
   * exists only as one instance per repository. This allows a central
   * management of plugin own resources.
   *
   * After creating the plugin instance the startUp() method is invoked. The
   * class is loaded through the descriptor's class loader, see
   * {@link #getCachedClass(PluginDescriptor, String)}.
   *
   * @param pDescriptor
   *          descriptor of the plugin to instantiate
   * @return the started plugin instance
   * @throws PluginRuntimeException
   *           if the plugin class cannot be loaded or instantiated
   */
  public Plugin getPluginInstance(PluginDescriptor pDescriptor)
      throws PluginRuntimeException {
    String pluginId = pDescriptor.getPluginId();
    Plugin activated = getActivatedPlugin(pluginId);
    if (activated != null) {
      return activated;
    }
    try {
      // Must synchronize here to make sure creation and initialization
      // of a plugin instance are done by one and only one thread.
      // The same is in Extension.getExtensionInstance().
      // Suggested by Stefan Groschupf <[email protected]>
      synchronized (pDescriptor) {
        // Re-check under the lock: another thread may have created the
        // instance between the first lookup and acquiring the lock.
        activated = getActivatedPlugin(pluginId);
        if (activated != null) {
          return activated;
        }
        Class<?> pluginClass = getCachedClass(pDescriptor,
            pDescriptor.getPluginClass());
        Constructor<?> constructor = pluginClass.getConstructor(new Class<?>[] {
            PluginDescriptor.class, Configuration.class });
        Plugin plugin = (Plugin) constructor.newInstance(new Object[] {
            pDescriptor, this.conf });
        plugin.startUp();
        synchronized (fActivatedPlugins) {
          fActivatedPlugins.put(pluginId, plugin);
        }
        return plugin;
      }
    } catch (ClassNotFoundException e) {
      throw new PluginRuntimeException(e);
    } catch (InstantiationException e) {
      throw new PluginRuntimeException(e);
    } catch (IllegalAccessException e) {
      throw new PluginRuntimeException(e);
    } catch (NoSuchMethodException e) {
      throw new PluginRuntimeException(e);
    } catch (InvocationTargetException e) {
      throw new PluginRuntimeException(e);
    }
  }

  /** Returns the started plugin for the id, or null if none was started yet. */
  private Plugin getActivatedPlugin(String pluginId) {
    synchronized (fActivatedPlugins) {
      return fActivatedPlugins.get(pluginId);
    }
  }

  /*
   * (non-Javadoc)
   * 
   * @see java.lang.Object#finalize()
   */
  @Override
  public void finalize() throws Throwable {
    try {
      shutDownActivatedPlugins();
    } finally {
      super.finalize();
    }
  }

  /**
   * Shuts down all plugins
   * 
   * @throws PluginRuntimeException
   */
  private void shutDownActivatedPlugins() throws PluginRuntimeException {
    List<Plugin> plugins;
    // snapshot under the lock, call into plugin code outside of it
    synchronized (fActivatedPlugins) {
      plugins = new ArrayList<Plugin>(fActivatedPlugins.values());
    }
    for (Plugin plugin : plugins) {
      plugin.shutDown();
    }
  }

  /**
   * Loads <code>className</code> through the class loader of the given
   * plugin, caching the result per class name and class loader.
   *
   * @param pDescriptor
   *          plugin whose class loader is used
   * @param className
   *          name of the class to load
   * @return the loaded class
   * @throws ClassNotFoundException
   *           if the plugin class loader cannot find the class
   */
  public Class getCachedClass(PluginDescriptor pDescriptor, String className)
      throws ClassNotFoundException {
    PluginClassLoader loader = pDescriptor.getClassLoader();
    // CLASS_CACHE is static and shared between repositories and threads
    synchronized (CLASS_CACHE) {
      Map<PluginClassLoader, Class> descMap = CLASS_CACHE.get(className);
      if (descMap == null) {
        descMap = new HashMap<PluginClassLoader, Class>();
        CLASS_CACHE.put(className, descMap);
      }
      Class clazz = descMap.get(loader);
      if (clazz == null) {
        clazz = loader.loadClass(className);
        descMap.put(loader, clazz);
      }
      return clazz;
    }
  }

  /** Logs the auto-activation mode, registered plugins and extension points. */
  private void displayStatus() {
    LOG.info("Plugin Auto-activation mode: [" + this.auto + "]");
    LOG.info("Registered Plugins:");

    if ((fRegisteredPlugins == null) || (fRegisteredPlugins.size() == 0)) {
      LOG.info("\tNONE");
    } else {
      for (PluginDescriptor plugin : fRegisteredPlugins) {
        LOG.info("\t" + plugin.getName() + " (" + plugin.getPluginId() + ")");
      }
    }

    LOG.info("Registered Extension-Points:");
    if ((fExtensionPoints == null) || (fExtensionPoints.size() == 0)) {
      LOG.info("\tNONE");
    } else {
      for (ExtensionPoint ep : fExtensionPoints.values()) {
        LOG.info("\t" + ep.getName() + " (" + ep.getId() + ")");
      }
    }
  }

  /**
   * Filters a list of plugins. The list of plugins is filtered regarding the
   * configuration properties <code>plugin.excludes</code> and
   * <code>plugin.includes</code>. A plugin is kept when its id fully matches
   * <code>includes</code> and does not fully match <code>excludes</code>.
   *
   * @param excludes
   *          pattern of plugin ids to drop
   * @param includes
   *          pattern of plugin ids to keep
   * @param plugins
   *          Map of plugins
   * @return map of plugins matching the configuration, never null
   */
  private Map<String, PluginDescriptor> filter(Pattern excludes,
      Pattern includes, Map<String, PluginDescriptor> plugins) {

    Map<String, PluginDescriptor> map = new HashMap<String, PluginDescriptor>();

    if (plugins == null) {
      return map;
    }

    for (PluginDescriptor plugin : plugins.values()) {

      if (plugin == null) {
        continue;
      }
      String id = plugin.getPluginId();
      if (id == null) {
        continue;
      }

      if (!includes.matcher(id).matches()) {
        LOG.debug("not including: " + id);
        continue;
      }
      if (excludes.matcher(id).matches()) {
        LOG.debug("excluding: " + id);
        continue;
      }
      map.put(plugin.getPluginId(), plugin);
    }
    return map;
  }

  /**
   * Get ordered list of plugins. Filter and normalization plugins are applied
   * in a configurable "pipeline" order, e.g., if one plugin depends on the
   * output of another plugin. This method loads the plugins in the order
   * defined by orderProperty. If orderProperty is empty or unset, all active
   * plugins of the given interface and extension point are loaded. The result
   * is cached in the {@link ObjectCache} of the configuration under the name
   * of <code>clazz</code>.
   *
   * @param clazz
   *          interface class implemented by required plugins
   * @param xPointId
   *          extension point id of required plugins
   * @param orderProperty
   *          property name defining plugin order
   * @return array of plugin instances
   */
  public synchronized Object[] getOrderedPlugins(Class<?> clazz,
      String xPointId, String orderProperty) {
    Object[] filters;
    ObjectCache objectCache = ObjectCache.get(conf);
    filters = (Object[]) objectCache.getObject(clazz.getName());

    if (filters == null) {
      String order = conf.get(orderProperty);
      List<String> orderOfFilters = new ArrayList<String>();
      boolean userDefinedOrder = false;
      if (order != null && !order.trim().isEmpty()) {
        orderOfFilters = Arrays.asList(order.trim().split("\\s+"));
        userDefinedOrder = true;
      }

      try {
        ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
            xPointId);
        if (point == null)
          throw new RuntimeException(xPointId + " not found.");
        Extension[] extensions = point.getExtensions();
        // one instance per implementation class; the first extension wins
        HashMap<String, Object> filterMap = new HashMap<String, Object>();
        for (int i = 0; i < extensions.length; i++) {
          Extension extension = extensions[i];
          Object filter = extension.getExtensionInstance();
          if (!filterMap.containsKey(filter.getClass().getName())) {
            filterMap.put(filter.getClass().getName(), filter);
            if (!userDefinedOrder)
              orderOfFilters.add(filter.getClass().getName());
          }
        }
        List<Object> sorted = new ArrayList<Object>();
        for (String orderedFilter : orderOfFilters) {
          Object f = filterMap.get(orderedFilter);
          if (f == null) {
            LOG.error(clazz.getSimpleName() + " : " + orderedFilter
                + " declared in configuration property " + orderProperty
                + " but not found in an active plugin - ignoring.");
            continue;
          }
          sorted.add(f);
        }
        Object[] filter = (Object[]) Array.newInstance(clazz, sorted.size());
        for (int i = 0; i < sorted.size(); i++) {
          filter[i] = sorted.get(i);
          if (LOG.isTraceEnabled()) {
            LOG.trace(clazz.getSimpleName() + " : filters[" + i + "] = "
                + filter[i].getClass());
          }
        }
        objectCache.setObject(clazz.getName(), filter);
      } catch (PluginRuntimeException e) {
        throw new RuntimeException(e);
      }

      filters = (Object[]) objectCache.getObject(clazz.getName());
    }
    return filters;
  }

  /**
   * Loads all necessary dependencies for a selected plugin, and then runs one
   * of the classes' main() method.
   *
   * @param args
   *          plugin ID (needs to be activated in the configuration), and the
   *          class name. The rest of arguments is passed to the main method of
   *          the selected class.
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err
          .println("Usage: PluginRepository pluginId className [arg1 arg2 ...]");
      return;
    }
    Configuration conf = NutchConfiguration.create();
    PluginRepository repo = new PluginRepository(conf);
    // args[0] - plugin ID
    PluginDescriptor d = repo.getPluginDescriptor(args[0]);
    if (d == null) {
      System.err.println("Plugin '" + args[0] + "' not present or inactive.");
      return;
    }
    ClassLoader cl = d.getClassLoader();
    // args[1] - class name
    Class<?> clazz = null;
    try {
      clazz = Class.forName(args[1], true, cl);
    } catch (Exception e) {
      System.err.println("Could not load the class '" + args[1] + "': "
          + e.getMessage());
      return;
    }
    Method m = null;
    try {
      m = clazz.getMethod("main", new Class<?>[] { args.getClass() });
    } catch (Exception e) {
      System.err.println("Could not find the 'main(String[])' method in class "
          + args[1] + ": " + e.getMessage());
      return;
    }
    String[] subargs = new String[args.length - 2];
    System.arraycopy(args, 2, subargs, 0, subargs.length);
    m.invoke(null, new Object[] { subargs });
  }
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRuntimeException.java ---------------------------------------------------------------------- diff --git 
a/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRuntimeException.java b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRuntimeException.java new file mode 100644 index 0000000..acccda2 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRuntimeException.java @@ -0,0 +1,37 @@
/*
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.plugin;

/**
 * <code>PluginRuntimeException</code> is thrown when an error occurs in the
 * plugin management.
 *
 * @author joa23
 */
public class PluginRuntimeException extends Exception {

  private static final long serialVersionUID = 1L;

  /**
   * Wraps another exception.
   *
   * @param cause
   *          the exception that caused this one
   */
  public PluginRuntimeException(Throwable cause) {
    super(cause);
  }

  /**
   * Creates an exception with a message only.
   *
   * @param message
   *          description of the failure
   */
  public PluginRuntimeException(String message) {
    super(message);
  }

  /**
   * Creates an exception with a message and the underlying cause, so that
   * callers can add context without losing the original stack trace.
   *
   * @param message
   *          description of the failure
   * @param cause
   *          the exception that caused this one
   */
  public PluginRuntimeException(String message, Throwable cause) {
    super(message, cause);
  }
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/package.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/package.html b/nutch-core/src/main/java/org/apache/nutch/plugin/package.html new file mode 100644 index 0000000..5ca4c9e --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/plugin/package.html @@ -0,0 +1,40 @@ +<html> +<body> +The Nutch {@link org.apache.nutch.plugin.Pluggable Plugin} System. +<p> +<b>The Nutch Plugin System provides a way to extend nutch functionality</b>. +A large part of the functionality of Nutch is provided by plugins: +All of the parsing, indexing and searching that nutch does is actually +accomplished by various plugins. +</p><p> +In writing a plugin, you're actually providing one or more extensions of the +existing extension-points (<i>hooks</i>). +The core Nutch extension-points are themselves defined in a plugin, +the <code>nutch-extensionpoints</code> plugin. +Each extension-point defines an interface that must be implemented by the +extension. The core extension-points and extensions available in Nutch are +listed in the {@link org.apache.nutch.plugin.Pluggable} interface. +</p> + +@see <a href="./doc-files/plugin.dtd">Nutch plugin manifest DTD</a> + +@see <a href="http://wiki.apache.org/nutch/PluginCentral"> + Plugin Central + </a> +@see <a href="http://wiki.apache.org/nutch/AboutPlugins"> + About Plugins + </a> +@see <a href="http://wiki.apache.org/nutch/WhyNutchHasAPluginSystem"> + Why Nutch has a Plugin System? 
+ </a> +@see <a href="http://wiki.apache.org/nutch/WhichTechnicalConceptsAreBehindTheNutchPluginSystem"> + Which technical concepts are behind the nutch plugin system? + </a> +@see <a href="http://wiki.apache.org/nutch/WhatsTheProblemWithPluginsAndClass-loading"> + What's the problem with Plugins and Class loading? + </a> +@see <a href="http://wiki.apache.org/nutch/WritingPluginExample"> + Writing Plugin Example + </a> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/Content.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/Content.java b/nutch-core/src/main/java/org/apache/nutch/protocol/Content.java new file mode 100755 index 0000000..4dc8277 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/protocol/Content.java @@ -0,0 +1,296 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */

package org.apache.nutch.protocol;

//JDK imports
import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.zip.InflaterInputStream;

//Hadoop imports
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VersionMismatchException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.GenericOptionsParser;

//Nutch imports
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;

/**
 * Fetched content of a url together with its base url, content type and
 * metadata, serializable as a Hadoop {@link Writable}.
 *
 * <p>
 * Records are written in the format identified by {@link #VERSION} (a
 * negative marker). When reading, a non-negative leading int is taken as the
 * byte length of a deflate-compressed record in one of the older formats
 * (versions 0 to 2).
 */
public final class Content implements Writable {

  public static final String DIR_NAME = "content";

  /** Format marker written first by {@link #write(DataOutput)}. */
  private final static int VERSION = -1;

  /** Version marker of the record last read by {@link #readFields}. */
  private int version;

  private String url;

  private String base;

  private byte[] content;

  private String contentType;

  private Metadata metadata;

  /** Only set by the full constructor; used to resolve the content type. */
  private MimeUtil mimeTypes;

  /** Creates an empty instance to be filled by {@link #readFields(DataInput)}. */
  public Content() {
    metadata = new Metadata();
  }

  /**
   * Creates content for a fetched url. The stored content type is resolved
   * from the given type name, the url and the data via {@link MimeUtil}.
   *
   * @throws IllegalArgumentException
   *           if url, base, content or metadata is null
   */
  public Content(String url, String base, byte[] content, String contentType,
      Metadata metadata, Configuration conf) {

    if (url == null)
      throw new IllegalArgumentException("null url");
    if (base == null)
      throw new IllegalArgumentException("null base");
    if (content == null)
      throw new IllegalArgumentException("null content");
    if (metadata == null)
      throw new IllegalArgumentException("null metadata");

    this.url = url;
    this.base = base;
    this.content = content;
    this.metadata = metadata;

    this.mimeTypes = new MimeUtil(conf);
    this.contentType = getContentType(contentType, url, content);
  }

  /**
   * Reads a record in one of the old formats (version byte 0, 1 or 2) from an
   * already inflated stream. Versions 0 and 1 store metadata as explicit
   * key/value lists, version 2 delegates to {@link Metadata#readFields}.
   *
   * @throws VersionMismatchException
   *           if the version byte is none of 0, 1, 2
   */
  private final void readFieldsCompressed(DataInput in) throws IOException {
    byte oldVersion = in.readByte();
    if (oldVersion < 0 || oldVersion > 2) {
      throw new VersionMismatchException((byte) 2, oldVersion);
    }

    url = Text.readString(in); // read url
    base = Text.readString(in); // read base

    content = new byte[in.readInt()]; // read content
    in.readFully(content);

    contentType = Text.readString(in); // read contentType

    if (oldVersion == 2) {
      metadata.readFields(in); // read meta data
      return;
    }

    // versions 0 and 1: reconstruct metadata
    int keySize = in.readInt();
    for (int i = 0; i < keySize; i++) {
      String key = Text.readString(in);
      int valueSize = in.readInt();
      for (int j = 0; j < valueSize; j++) {
        metadata.add(key, Text.readString(in));
      }
    }
  }

  /**
   * Replaces the state of this object with a record read from <code>in</code>.
   *
   * @throws VersionMismatchException
   *           if the record carries an unknown version marker
   */
  public final void readFields(DataInput in) throws IOException {
    metadata.clear();
    int sizeOrVersion = in.readInt();
    if (sizeOrVersion < 0) { // version
      version = sizeOrVersion;
      switch (version) {
      case VERSION:
        url = Text.readString(in);
        base = Text.readString(in);

        content = new byte[in.readInt()];
        in.readFully(content);

        contentType = Text.readString(in);
        metadata.readFields(in);
        break;
      default:
        throw new VersionMismatchException((byte) VERSION, (byte) version);
      }
    } else { // size
      byte[] compressed = new byte[sizeOrVersion];
      in.readFully(compressed, 0, compressed.length);
      ByteArrayInputStream deflated = new ByteArrayInputStream(compressed);
      DataInputStream inflater = new DataInputStream(new InflaterInputStream(
          deflated));
      try {
        readFieldsCompressed(inflater);
      } finally {
        // closing releases the Inflater owned by the InflaterInputStream
        inflater.close();
      }
    }
  }

  /** Writes this object in the current ({@link #VERSION}) format. */
  public final void write(DataOutput out) throws IOException {
    out.writeInt(VERSION);

    Text.writeString(out, url); // write url
    Text.writeString(out, base); // write base

    out.writeInt(content.length); // write content
    out.write(content);

    Text.writeString(out, contentType); // write contentType

    metadata.write(out); // write metadata
  }

  /** Creates a new instance and fills it from <code>in</code>. */
  public static Content read(DataInput in) throws IOException {
    Content content = new Content();
    content.readFields(in);
    return content;
  }

  //
  // Accessor methods
  //

  /** The url fetched. */
  public String getUrl() {
    return url;
  }

  /**
   * The base url for relative links contained in the content. May be
   * different from url if the request redirected.
   */
  public String getBaseUrl() {
    return base;
  }

  /** The binary content retrieved. */
  public byte[] getContent() {
    return content;
  }

  public void setContent(byte[] content) {
    this.content = content;
  }

  /**
   * The media type of the retrieved content.
   *
   * @see <a href="http://www.iana.org/assignments/media-types/">
   *      http://www.iana.org/assignments/media-types/</a>
   */
  public String getContentType() {
    return contentType;
  }

  public void setContentType(String contentType) {
    this.contentType = contentType;
  }

  /** Other protocol-specific data. */
  public Metadata getMetadata() {
    return metadata;
  }

  /** Other protocol-specific data. */
  public void setMetadata(Metadata metadata) {
    this.metadata = metadata;
  }

  /**
   * Two contents are equal when url, base url, bytes, content type and
   * metadata are equal. Unset (null) fields only equal other unset fields.
   */
  @Override
  public boolean equals(Object o) {
    if (!(o instanceof Content)) {
      return false;
    }
    Content that = (Content) o;
    return sameValue(this.url, that.url) && sameValue(this.base, that.base)
        && Arrays.equals(this.getContent(), that.getContent())
        && sameValue(this.contentType, that.contentType)
        && sameValue(this.metadata, that.metadata);
  }

  /**
   * Hash over url, base url, bytes and content type. Metadata is deliberately
   * left out: equal contents still get equal hashes, without relying on how
   * {@link Metadata} implements <code>hashCode</code>.
   */
  @Override
  public int hashCode() {
    int h = (url == null) ? 0 : url.hashCode();
    h = 31 * h + ((base == null) ? 0 : base.hashCode());
    h = 31 * h + Arrays.hashCode(content);
    h = 31 * h + ((contentType == null) ? 0 : contentType.hashCode());
    return h;
  }

  /** Null-safe equality of two values. */
  private static boolean sameValue(Object a, Object b) {
    return (a == null) ? (b == null) : a.equals(b);
  }

  @Override
  public String toString() {
    StringBuilder buffer = new StringBuilder();

    buffer.append("Version: " + version + "\n");
    buffer.append("url: " + url + "\n");
    buffer.append("base: " + base + "\n");
    buffer.append("contentType: " + contentType + "\n");
    buffer.append("metadata: " + metadata + "\n");
    buffer.append("Content:\n");
    if (content != null) {
      buffer.append(new String(content)); // try default encoding
    }

    return buffer.toString();

  }

  /**
   * Prints record <code>recno</code> of the content directory of a segment.
   */
  public static void main(String argv[]) throws Exception {

    String usage = "Content (-local | -dfs <namenode:port>) recno segment";

    if (argv.length < 3) {
      System.out.println("usage:" + usage);
      return;
    }
    Options opts = new Options();
    Configuration conf = NutchConfiguration.create();

    GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);

    String[] remainingArgs = parser.getRemainingArgs();
    if (remainingArgs.length < 2) {
      // recno and segment must both remain after generic option parsing
      System.out.println("usage:" + usage);
      return;
    }
    FileSystem fs = FileSystem.get(conf);

    try {
      int recno = Integer.parseInt(remainingArgs[0]);
      String segment = remainingArgs[1];

      Path file = new Path(segment, DIR_NAME);
      System.out.println("Reading from file: " + file);

      ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(),
          conf);
      try {
        Content content = new Content();
        contents.get(recno, content);
        System.out.println("Retrieved " + recno + " from file " + file);

        System.out.println(content);
      } finally {
        contents.close();
      }
    } finally {
      fs.close();
    }
  }

  /** Resolves the content type through the configured {@link MimeUtil}. */
  private String getContentType(String typeName, String url, byte[] data) {
    return this.mimeTypes.autoResolveContentType(typeName, url, data);
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/Protocol.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/Protocol.java b/nutch-core/src/main/java/org/apache/nutch/protocol/Protocol.java new file mode 100755 index 0000000..0aa5d29 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/protocol/Protocol.java @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol; + +// Hadoop imports +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.io.Text; + +// Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.plugin.Pluggable; + +import crawlercommons.robots.BaseRobotRules; + +/** A retriever of url content. Implemented by protocol extensions. */ +public interface Protocol extends Pluggable, Configurable { + /** The name of the extension point. */ + public final static String X_POINT_ID = Protocol.class.getName(); + + /** + * Property name. 
If in the current configuration this property is set to + * true, protocol implementations should handle "politeness" limits + * internally. If this is set to false, it is assumed that these limits are + * enforced elsewhere, and protocol implementations should not enforce them + * internally. + */ + public final static String CHECK_BLOCKING = "protocol.plugin.check.blocking"; + + /** + * Property name. If in the current configuration this property is set to + * true, protocol implementations should handle robot exclusion rules + * internally. If this is set to false, it is assumed that these limits are + * enforced elsewhere, and protocol implementations should not enforce them + * internally. + */ + public final static String CHECK_ROBOTS = "protocol.plugin.check.robots"; + + /** + * Returns the {@link Content} for a fetchlist entry. + */ + ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum); + + /** + * Retrieve robot rules applicable for this url. + * + * @param url + * url to check + * @param datum + * page datum + * @return robot rules (specific for this url or default), never null + */ + BaseRobotRules getRobotRules(Text url, CrawlDatum datum); +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolException.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolException.java b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolException.java new file mode 100755 index 0000000..fc4add5 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolException.java @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+/**
+ * Checked exception of the protocol package. Every constructor forwards its
+ * arguments unchanged to the matching {@link Exception} constructor.
+ */
+@SuppressWarnings("serial")
+public class ProtocolException extends Exception {
+
+  /**
+   * Creates an exception with a detail message and a cause.
+   *
+   * @param message
+   *          the detail message
+   * @param cause
+   *          the underlying cause
+   */
+  public ProtocolException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  /**
+   * Creates an exception with a detail message and no cause.
+   *
+   * @param message
+   *          the detail message
+   */
+  public ProtocolException(String message) {
+    super(message);
+  }
+
+  /**
+   * Creates an exception that wraps a cause.
+   *
+   * @param cause
+   *          the underlying cause
+   */
+  public ProtocolException(Throwable cause) {
+    super(cause);
+  }
+
+  /** Creates an exception without detail message or cause. */
+  public ProtocolException() {
+    super();
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolFactory.java b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolFactory.java
new file mode 100644
index 0000000..8a92d60
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolFactory.java
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.plugin.*;
+import org.apache.nutch.util.ObjectCache;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Creates and caches {@link Protocol} plugins. Protocol plugins should define
+ * the attribute "protocolName" with the name of the protocol that they
+ * implement; the attribute value is split at commas and spaces, so it may
+ * list several names. Instances are cached in the {@link ObjectCache}
+ * obtained for the configuration. Cache key is constructed from appending
+ * protocol name (eg. http) to constant {@link Protocol#X_POINT_ID}.
+ */
+public class ProtocolFactory {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(ProtocolFactory.class);
+
+  private final ExtensionPoint extensionPoint;
+
+  private final Configuration conf;
+
+  /**
+   * Creates a factory for the given configuration.
+   *
+   * @param conf
+   *          configuration used to obtain the plugin repository and the
+   *          object cache
+   * @throws RuntimeException
+   *           if the plugin repository has no extension point named
+   *           {@link Protocol#X_POINT_ID}
+   */
+  public ProtocolFactory(Configuration conf) {
+    this.conf = conf;
+    this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
+        Protocol.X_POINT_ID);
+    if (this.extensionPoint == null) {
+      throw new RuntimeException("x-point " + Protocol.X_POINT_ID
+          + " not found.");
+    }
+  }
+
+  /**
+   * Returns the appropriate {@link Protocol} implementation for a url.
+   *
+   * @param urlString
+   *          Url String
+   * @return The appropriate {@link Protocol} implementation for a given
+   *         {@link URL}.
+   * @throws ProtocolNotFound
+   *           when Protocol can not be found for urlString: the url is
+   *           malformed, no extension declares its protocol name, or a
+   *           {@link PluginRuntimeException} is raised. In the first and
+   *           last case the original exception is attached as the cause.
+   */
+  public synchronized Protocol getProtocol(String urlString)
+      throws ProtocolNotFound {
+    ObjectCache objectCache = ObjectCache.get(conf);
+    try {
+      URL url = new URL(urlString);
+      String protocolName = url.getProtocol();
+      if (protocolName == null) {
+        throw new ProtocolNotFound(urlString);
+      }
+
+      String cacheId = Protocol.X_POINT_ID + protocolName;
+      Protocol protocol = (Protocol) objectCache.getObject(cacheId);
+      if (protocol != null) {
+        return protocol;
+      }
+
+      Extension extension = findExtension(protocolName);
+      if (extension == null) {
+        // NOTE(review): the protocol name (not the url) is handed over as
+        // the exception's url here; kept unchanged for compatibility.
+        throw new ProtocolNotFound(protocolName);
+      }
+
+      protocol = (Protocol) extension.getExtensionInstance();
+      objectCache.setObject(cacheId, protocol);
+      return protocol;
+    } catch (MalformedURLException e) {
+      throw notFound(urlString, e);
+    } catch (PluginRuntimeException e) {
+      throw notFound(urlString, e);
+    }
+  }
+
+  /**
+   * Builds the {@link ProtocolNotFound} reported for a failed lookup. The
+   * message stays <code>cause.toString()</code> as before; the cause is
+   * additionally chained so that its stack trace is not lost.
+   */
+  private static ProtocolNotFound notFound(String urlString, Exception cause) {
+    ProtocolNotFound failure = new ProtocolNotFound(urlString,
+        cause.toString());
+    failure.initCause(cause);
+    return failure;
+  }
+
+  /**
+   * Returns the first extension of the extension point whose "protocolName"
+   * attribute lists the given name, or null if there is none.
+   */
+  private Extension findExtension(String name) throws PluginRuntimeException {
+    for (Extension extension : this.extensionPoint.getExtensions()) {
+      if (contains(name, extension.getAttribute("protocolName"))) {
+        return extension;
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Tells whether <code>what</code> equals one of the tokens obtained by
+   * splitting <code>where</code> at every comma and space.
+   *
+   * @param what
+   *          token to look for
+   * @param where
+   *          comma/space separated list; may be null
+   * @return true if the token is listed, false otherwise (including when
+   *         <code>where</code> is null)
+   */
+  boolean contains(String what, String where) {
+    if (where == null) {
+      // Nothing listed (e.g. the attribute is not declared): no match,
+      // rather than a NullPointerException that aborts the whole lookup.
+      return false;
+    }
+    for (String part : where.split("[, ]")) {
+      if (part.equals(what)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolNotFound.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolNotFound.java b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolNotFound.java
new file mode 100644
index 0000000..8cadc23
--- /dev/null
+++
b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolNotFound.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+/**
+ * {@link ProtocolException} that additionally carries the url it was raised
+ * for.
+ */
+@SuppressWarnings("serial")
+public class ProtocolNotFound extends ProtocolException {
+  private String url;
+
+  /**
+   * Creates the exception with an explicit detail message.
+   *
+   * @param url
+   *          value later returned by {@link #getUrl()}
+   * @param message
+   *          the detail message
+   */
+  public ProtocolNotFound(String url, String message) {
+    super(message);
+    this.url = url;
+  }
+
+  /**
+   * Creates the exception with the default detail message
+   * <code>"protocol not found for url=" + url</code>.
+   *
+   * @param url
+   *          value later returned by {@link #getUrl()}
+   */
+  public ProtocolNotFound(String url) {
+    this(url, "protocol not found for url=" + url);
+  }
+
+  /**
+   * Returns the url passed to the constructor.
+   *
+   * @return the url this exception was created with
+   */
+  public String getUrl() {
+    return this.url;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolOutput.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolOutput.java b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolOutput.java
new file mode 100644
index 0000000..c7f0c2c
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolOutput.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+/**
+ * Simple aggregate to pass from protocol plugins both content and protocol
+ * status.
+ *
+ * @author Andrzej Bialecki &lt;[email protected]&gt;
+ */
+public class ProtocolOutput {
+  private Content content;
+  private ProtocolStatus status;
+
+  /**
+   * Creates an output whose status is {@link ProtocolStatus#STATUS_SUCCESS}.
+   *
+   * @param content
+   *          the content to carry
+   */
+  public ProtocolOutput(Content content) {
+    this(content, ProtocolStatus.STATUS_SUCCESS);
+  }
+
+  /**
+   * Creates an output from a content and a status.
+   *
+   * @param content
+   *          the content to carry
+   * @param status
+   *          the protocol status to carry
+   */
+  public ProtocolOutput(Content content, ProtocolStatus status) {
+    this.content = content;
+    this.status = status;
+  }
+
+  /**
+   * @return the content currently held
+   */
+  public Content getContent() {
+    return this.content;
+  }
+
+  /**
+   * @param content
+   *          the content to hold from now on
+   */
+  public void setContent(Content content) {
+    this.content = content;
+  }
+
+  /**
+   * @return the protocol status currently held
+   */
+  public ProtocolStatus getStatus() {
+    return this.status;
+  }
+
+  /**
+   * @param status
+   *          the protocol status to hold from now on
+   */
+  public void setStatus(ProtocolStatus status) {
+    this.status = status;
+  }
+}
