Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Thu Jan 29 05:38:59 2015 @@ -38,18 +38,18 @@ import org.apache.nutch.plugin.PluginRep import org.apache.nutch.util.MimeUtil; import org.apache.nutch.util.ObjectCache; - -/** Creates and caches {@link Parser} plugins.*/ +/** Creates and caches {@link Parser} plugins. */ public final class ParserFactory { - + public static final Logger LOG = LoggerFactory.getLogger(ParserFactory.class); - + /** Wildcard for default plugins. */ public static final String DEFAULT_PLUGIN = "*"; - + /** Empty extension list for caching purposes. */ - private final List<Extension> EMPTY_EXTENSION_LIST = Collections.<Extension>emptyList(); - + private final List<Extension> EMPTY_EXTENSION_LIST = Collections + .<Extension> emptyList(); + private Configuration conf; private ExtensionPoint extensionPoint; private ParsePluginList parsePluginList; @@ -57,12 +57,15 @@ public final class ParserFactory { public ParserFactory(Configuration conf) { this.conf = conf; ObjectCache objectCache = ObjectCache.get(conf); - this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(Parser.X_POINT_ID); - this.parsePluginList = (ParsePluginList)objectCache.getObject(ParsePluginList.class.getName()); - + this.extensionPoint = PluginRepository.get(conf).getExtensionPoint( + Parser.X_POINT_ID); + this.parsePluginList = (ParsePluginList) objectCache + .getObject(ParsePluginList.class.getName()); + if (this.parsePluginList == null) { this.parsePluginList = new ParsePluginsReader().parse(conf); - objectCache.setObject(ParsePluginList.class.getName(), this.parsePluginList); + objectCache.setObject(ParsePluginList.class.getName(), + this.parsePluginList); } if (this.extensionPoint == null) { @@ -72,45 +75,46 @@ public final class ParserFactory { throw new RuntimeException( "Parse Plugins preferences could not be loaded."); } - } - - + } + /** * Function returns an array of {@link Parser}s for a given content type. - * + * * The function consults the internal list of parse plugins for the - * ParserFactory to determine the list of pluginIds, then gets the - * appropriate extension points to instantiate as {@link Parser}s. - * - * @param contentType The contentType to return the <code>Array</code> - * of {@link Parser}s for. - * @param url The url for the content that may allow us to get the type from - * the file suffix. + * ParserFactory to determine the list of pluginIds, then gets the appropriate + * extension points to instantiate as {@link Parser}s. + * + * @param contentType + * The contentType to return the <code>Array</code> of {@link Parser} + * s for. + * @param url + * The url for the content that may allow us to get the type from the + * file suffix. * @return An <code>Array</code> of {@link Parser}s for the given contentType. * If there were plugins mapped to a contentType via the - * <code>parse-plugins.xml</code> file, but never enabled via - * the <code>plugin.includes</code> Nutch conf, then those plugins - * won't be part of this array, i.e., they will be skipped. - * So, if the ordered list of parsing plugins for - * <code>text/plain</code> was <code>[parse-text,parse-html, + * <code>parse-plugins.xml</code> file, but never enabled via the + * <code>plugin.includes</code> Nutch conf, then those plugins won't + * be part of this array, i.e., they will be skipped. So, if the + * ordered list of parsing plugins for <code>text/plain</code> was + * <code>[parse-text,parse-html, * parse-rtf]</code>, and only <code>parse-html</code> and * <code>parse-rtf</code> were enabled via - * <code>plugin.includes</code>, then this ordered Array would - * consist of two {@link Parser} interfaces, + * <code>plugin.includes</code>, then this ordered Array would consist + * of two {@link Parser} interfaces, * <code>[parse-html, parse-rtf]</code>. */ public Parser[] getParsers(String contentType, String url) - throws ParserNotFound { - + throws ParserNotFound { + List<Parser> parsers = null; List<Extension> parserExts = null; - + ObjectCache objectCache = ObjectCache.get(conf); - + // TODO once the MimeTypes is available // parsers = getExtensions(MimeUtils.map(contentType)); // if (parsers != null) { - // return parsers; + // return parsers; // } // Last Chance: Guess content-type from file url... // parsers = getExtensions(MimeUtils.getMimeType(url)); @@ -121,50 +125,51 @@ public final class ParserFactory { } parsers = new Vector<Parser>(parserExts.size()); - for (Iterator<Extension> i = parserExts.iterator(); i.hasNext(); ){ + for (Iterator<Extension> i = parserExts.iterator(); i.hasNext();) { Extension ext = i.next(); Parser p = null; try { - //check to see if we've cached this parser instance yet + // check to see if we've cached this parser instance yet p = (Parser) objectCache.getObject(ext.getId()); if (p == null) { // go ahead and instantiate it and then cache it p = (Parser) ext.getExtensionInstance(); - objectCache.setObject(ext.getId(),p); + objectCache.setObject(ext.getId(), p); } parsers.add(p); } catch (PluginRuntimeException e) { if (LOG.isWarnEnabled()) { LOG.warn("ParserFactory:PluginRuntimeException when " - + "initializing parser plugin " - + ext.getDescriptor().getPluginId() - + " instance in getParsers " - + "function: attempting to continue instantiating parsers"); + + "initializing parser plugin " + + ext.getDescriptor().getPluginId() + " instance in getParsers " + + "function: attempting to continue instantiating parsers"); } } } - return parsers.toArray(new Parser[]{}); + return parsers.toArray(new Parser[] {}); } - + /** * Function returns a {@link Parser} instance with the specified - * <code>extId</code>, representing its extension ID. If the Parser - * instance isn't found, then the function throws a - * <code>ParserNotFound</code> exception. If the function is able to find - * the {@link Parser} in the internal <code>PARSER_CACHE</code> then it - * will return the already instantiated Parser. Otherwise, if it has to - * instantiate the Parser itself , then this function will cache that Parser - * in the internal <code>PARSER_CACHE</code>. + * <code>extId</code>, representing its extension ID. If the Parser instance + * isn't found, then the function throws a <code>ParserNotFound</code> + * exception. If the function is able to find the {@link Parser} in the + * internal <code>PARSER_CACHE</code> then it will return the already + * instantiated Parser. Otherwise, if it has to instantiate the Parser itself + * , then this function will cache that Parser in the internal + * <code>PARSER_CACHE</code>. * - * @param id The string extension ID (e.g., - * "org.apache.nutch.parse.rss.RSSParser", - * "org.apache.nutch.parse.rtf.RTFParseFactory") of the {@link Parser} - * implementation to return. + * @param id + * The string extension ID (e.g., + * "org.apache.nutch.parse.rss.RSSParser", + * "org.apache.nutch.parse.rtf.RTFParseFactory") of the + * {@link Parser} implementation to return. * @return A {@link Parser} implementation specified by the parameter * <code>id</code>. - * @throws ParserNotFound If the Parser is not found (i.e., registered with - * the extension point), or if the there a - * {@link PluginRuntimeException} instantiating the {@link Parser}. + * @throws ParserNotFound + * If the Parser is not found (i.e., registered with the extension + * point), or if the there a {@link PluginRuntimeException} + * instantiating the {@link Parser}. */ public Parser getParserById(String id) throws ParserNotFound { @@ -172,7 +177,7 @@ public final class ParserFactory { Extension parserExt = null; ObjectCache objectCache = ObjectCache.get(conf); - + if (id != null) { parserExt = getExtension(extensions, id); } @@ -183,12 +188,12 @@ public final class ParserFactory { if (parserExt == null) { throw new ParserNotFound("No Parser Found for id [" + id + "]"); } - - // first check the cache + + // first check the cache if (objectCache.getObject(parserExt.getId()) != null) { return (Parser) objectCache.getObject(parserExt.getId()); - // if not found in cache, instantiate the Parser + // if not found in cache, instantiate the Parser } else { try { Parser p = (Parser) parserExt.getExtensionInstance(); @@ -196,31 +201,31 @@ public final class ParserFactory { return p; } catch (PluginRuntimeException e) { if (LOG.isWarnEnabled()) { - LOG.warn("Canno initialize parser " + - parserExt.getDescriptor().getPluginId() + - " (cause: " + e.toString()); + LOG.warn("Canno initialize parser " + + parserExt.getDescriptor().getPluginId() + " (cause: " + + e.toString()); } throw new ParserNotFound("Cannot init parser for id [" + id + "]"); } } } - + /** * Finds the best-suited parse plugin for a given contentType. * - * @param contentType Content-Type for which we seek a parse plugin. - * @return a list of extensions to be used for this contentType. - * If none, returns <code>null</code>. + * @param contentType + * Content-Type for which we seek a parse plugin. + * @return a list of extensions to be used for this contentType. If none, + * returns <code>null</code>. */ @SuppressWarnings("unchecked") protected List<Extension> getExtensions(String contentType) { - + ObjectCache objectCache = ObjectCache.get(conf); // First of all, tries to clean the content-type String type = null; type = MimeUtil.cleanMimeType(contentType); - List<Extension> extensions = (List<Extension>) objectCache.getObject(type); // Just compare the reference: @@ -228,100 +233,105 @@ public final class ParserFactory { if (extensions == EMPTY_EXTENSION_LIST) { return null; } - + if (extensions == null) { extensions = findExtensions(type); if (extensions != null) { objectCache.setObject(type, extensions); } else { - // Put the empty extension list into cache - // to remember we don't know any related extension. + // Put the empty extension list into cache + // to remember we don't know any related extension. objectCache.setObject(type, EMPTY_EXTENSION_LIST); } } return extensions; } - + /** * searches a list of suitable parse plugins for the given contentType. - * <p>It first looks for a preferred plugin defined in the parse-plugin - * file. If none is found, it returns a list of default plugins. + * <p> + * It first looks for a preferred plugin defined in the parse-plugin file. If + * none is found, it returns a list of default plugins. * - * @param contentType Content-Type for which we seek a parse plugin. - * @return List - List of extensions to be used for this contentType. - * If none, returns null. + * @param contentType + * Content-Type for which we seek a parse plugin. + * @return List - List of extensions to be used for this contentType. If none, + * returns null. */ private List<Extension> findExtensions(String contentType) { - + Extension[] extensions = this.extensionPoint.getExtensions(); - + // Look for a preferred plugin. - List<String> parsePluginList = - this.parsePluginList.getPluginList(contentType); - List<Extension> extensionList = - matchExtensions(parsePluginList, extensions, contentType); + List<String> parsePluginList = this.parsePluginList + .getPluginList(contentType); + List<Extension> extensionList = matchExtensions(parsePluginList, + extensions, contentType); if (extensionList != null) { return extensionList; } - + // If none found, look for a default plugin. parsePluginList = this.parsePluginList.getPluginList(DEFAULT_PLUGIN); return matchExtensions(parsePluginList, extensions, DEFAULT_PLUGIN); } - + /** * Tries to find a suitable parser for the given contentType. * <ol> - * <li>It checks if a parser which accepts the contentType - * can be found in the <code>plugins</code> list;</li> - * <li>If this list is empty, it tries to find amongst the loaded - * extensions whether some of them might suit and warns the user.</li> + * <li>It checks if a parser which accepts the contentType can be found in the + * <code>plugins</code> list;</li> + * <li>If this list is empty, it tries to find amongst the loaded extensions + * whether some of them might suit and warns the user.</li> * </ol> - * @param plugins List of candidate plugins. - * @param extensions Array of loaded extensions. - * @param contentType Content-Type for which we seek a parse plugin. - * @return List - List of extensions to be used for this contentType. - * If none, returns null. + * + * @param plugins + * List of candidate plugins. + * @param extensions + * Array of loaded extensions. + * @param contentType + * Content-Type for which we seek a parse plugin. + * @return List - List of extensions to be used for this contentType. If none, + * returns null. */ private List<Extension> matchExtensions(List<String> plugins, - Extension[] extensions, - String contentType) { - + Extension[] extensions, String contentType) { + List<Extension> extList = new ArrayList<Extension>(); if (plugins != null) { - + for (String parsePluginId : plugins) { - + Extension ext = getExtension(extensions, parsePluginId, contentType); // the extension returned may be null // that means that it was not enabled in the plugin.includes // nutch conf property, but it was mapped in the // parse-plugins.xml - // file. + // file. // OR it was enabled in plugin.includes, but the plugin's plugin.xml // file does not claim that the plugin supports the specified mimeType // in either case, LOG the appropriate error message to WARN level - + if (ext == null) { - //try to get it just by its pluginId + // try to get it just by its pluginId ext = getExtension(extensions, parsePluginId); - - if (LOG.isWarnEnabled()) { + + if (LOG.isWarnEnabled()) { if (ext != null) { // plugin was enabled via plugin.includes // its plugin.xml just doesn't claim to support that // particular mimeType - LOG.warn("ParserFactory:Plugin: " + parsePluginId + - " mapped to contentType " + contentType + - " via parse-plugins.xml, but " + "its plugin.xml " + - "file does not claim to support contentType: " + - contentType); + LOG.warn("ParserFactory:Plugin: " + parsePluginId + + " mapped to contentType " + contentType + + " via parse-plugins.xml, but " + "its plugin.xml " + + "file does not claim to support contentType: " + + contentType); } else { // plugin wasn't enabled via plugin.includes - LOG.warn("ParserFactory: Plugin: " + parsePluginId + - " mapped to contentType " + contentType + - " via parse-plugins.xml, but not enabled via " + - "plugin.includes in nutch-default.xml"); + LOG.warn("ParserFactory: Plugin: " + parsePluginId + + " mapped to contentType " + contentType + + " via parse-plugins.xml, but not enabled via " + + "plugin.includes in nutch-default.xml"); } } } @@ -331,7 +341,7 @@ public final class ParserFactory { extList.add(ext); } } - + } else { // okay, there were no list of plugins defined for // this mimeType, however, there may be plugins registered @@ -340,75 +350,78 @@ public final class ParserFactory { // so, iterate through the list of extensions and if you find // any extensions where this is the case, throw a // NotMappedParserException - - for (int i=0; i<extensions.length; i++) { - if ("*".equals(extensions[i].getAttribute("contentType"))){ + + for (int i = 0; i < extensions.length; i++) { + if ("*".equals(extensions[i].getAttribute("contentType"))) { extList.add(0, extensions[i]); - } - else if (extensions[i].getAttribute("contentType") != null - && contentType.matches(escapeContentType(extensions[i].getAttribute("contentType")))) { + } else if (extensions[i].getAttribute("contentType") != null + && contentType.matches(escapeContentType(extensions[i] + .getAttribute("contentType")))) { extList.add(extensions[i]); } } - + if (extList.size() > 0) { if (LOG.isInfoEnabled()) { StringBuffer extensionsIDs = new StringBuffer("["); boolean isFirst = true; - for (Extension ext : extList){ - if (!isFirst) extensionsIDs.append(" - "); - else isFirst=false; - extensionsIDs.append(ext.getId()); + for (Extension ext : extList) { + if (!isFirst) + extensionsIDs.append(" - "); + else + isFirst = false; + extensionsIDs.append(ext.getId()); } - extensionsIDs.append("]"); - LOG.info("The parsing plugins: " + extensionsIDs.toString() + - " are enabled via the plugin.includes system " + - "property, and all claim to support the content type " + - contentType + ", but they are not mapped to it in the " + - "parse-plugins.xml file"); + extensionsIDs.append("]"); + LOG.info("The parsing plugins: " + extensionsIDs.toString() + + " are enabled via the plugin.includes system " + + "property, and all claim to support the content type " + + contentType + ", but they are not mapped to it in the " + + "parse-plugins.xml file"); } } else if (LOG.isDebugEnabled()) { - LOG.debug("ParserFactory:No parse plugins mapped or enabled for " + - "contentType " + contentType); + LOG.debug("ParserFactory:No parse plugins mapped or enabled for " + + "contentType " + contentType); } } - + return (extList.size() > 0) ? extList : null; } - + private String escapeContentType(String contentType) { - // Escapes contentType in order to use as a regex - // (and keep backwards compatibility). - // This enables to accept multiple types for a single parser. - return contentType.replace("+", "\\+").replace(".", "\\."); - } + // Escapes contentType in order to use as a regex + // (and keep backwards compatibility). + // This enables to accept multiple types for a single parser. + return contentType.replace("+", "\\+").replace(".", "\\."); + } private boolean match(Extension extension, String id, String type) { - return ((id.equals(extension.getId())) && - (extension.getAttribute("contentType").equals("*") || - type.matches(escapeContentType(extension.getAttribute("contentType"))) || - type.equals(DEFAULT_PLUGIN))); + return ((id.equals(extension.getId())) && (extension.getAttribute( + "contentType").equals("*") + || type + .matches(escapeContentType(extension.getAttribute("contentType"))) || type + .equals(DEFAULT_PLUGIN))); } - + /** Get an extension from its id and supported content-type. */ private Extension getExtension(Extension[] list, String id, String type) { - for (int i=0; i<list.length; i++) { + for (int i = 0; i < list.length; i++) { if (match(list[i], id, type)) { return list[i]; } } return null; } - + private Extension getExtension(Extension[] list, String id) { - for (int i=0; i<list.length; i++) { + for (int i = 0; i < list.length; i++) { if (id.equals(list[i].getId())) { return list[i]; } } return null; } - + private Extension getExtensionFromAlias(Extension[] list, String id) { return getExtension(list, parsePluginList.getAliases().get(id)); }
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java Thu Jan 29 05:38:59 2015 @@ -18,17 +18,17 @@ package org.apache.nutch.parse; public class ParserNotFound extends ParseException { - private static final long serialVersionUID=23993993939L; + private static final long serialVersionUID = 23993993939L; private String url; private String contentType; - public ParserNotFound(String message){ - super(message); + public ParserNotFound(String message) { + super(message); } - + public ParserNotFound(String url, String contentType) { - this(url, contentType, - "parser not found for contentType="+contentType+" url="+url); + this(url, contentType, "parser not found for contentType=" + contentType + + " url=" + url); } public ParserNotFound(String url, String contentType, String message) { @@ -37,6 +37,11 @@ public class ParserNotFound extends Pars this.contentType = contentType; } - public String getUrl() { return url; } - public String getContentType() { return contentType; } + public String getUrl() { + return url; + } + + public String getContentType() { + return contentType; + } } Modified: nutch/trunk/src/java/org/apache/nutch/parse/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/package-info.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/package-info.java Thu Jan 29 05:38:59 2015 @@ -19,3 +19,4 @@ * The {@link org.apache.nutch.parse.Parse Parse} interface and related classes. */ package org.apache.nutch.parse; + Modified: nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java Thu Jan 29 05:38:59 2015 @@ -16,7 +16,6 @@ */ package org.apache.nutch.plugin; - /** * <code>CircularDependencyException</code> will be thrown if a circular * dependency is detected. Modified: nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java Thu Jan 29 05:38:59 2015 @@ -94,8 +94,10 @@ public class Extension { * Adds a attribute and is only used until model creation at plugin system * start up. * - * @param pKey a key - * @param pValue a value + * @param pKey + * a key + * @param pValue + * a value */ public void addAttribute(String pKey, String pValue) { fAttributes.put(pKey, pValue); @@ -105,7 +107,8 @@ public class Extension { * Sets the Class that implement the concret extension and is only used until * model creation at system start up. * - * @param extensionClazz The extensionClasname to set + * @param extensionClazz + * The extensionClasname to set */ public void setClazz(String extensionClazz) { fClazz = extensionClazz; @@ -115,7 +118,8 @@ public class Extension { * Sets the unique extension Id and is only used until model creation at * system start up. * - * @param extensionID The extensionID to set + * @param extensionID + * The extensionID to set */ public void setId(String extensionID) { fId = extensionID; @@ -147,10 +151,10 @@ public class Extension { // The same is in PluginRepository.getPluginInstance(). // Suggested by Stefan Groschupf <[email protected]> synchronized (getId()) { - try { + try { PluginRepository pluginRepository = PluginRepository.get(conf); - Class extensionClazz = - pluginRepository.getCachedClass(fDescriptor, getClazz()); + Class extensionClazz = pluginRepository.getCachedClass(fDescriptor, + getClazz()); // lazy loading of Plugin in case there is no instance of the plugin // already. pluginRepository.getPluginInstance(getDescriptor()); Modified: nutch/trunk/src/java/org/apache/nutch/plugin/ExtensionPoint.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/ExtensionPoint.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/plugin/ExtensionPoint.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/ExtensionPoint.java Thu Jan 29 05:38:59 2015 @@ -15,6 +15,7 @@ * limitations under the License. */ package org.apache.nutch.plugin; + import java.util.ArrayList; /** @@ -76,7 +77,8 @@ public class ExtensionPoint { /** * Sets the extensionPointId. * - * @param pId extension point id + * @param pId + * extension point id */ private void setId(String pId) { ftId = pId; Modified: nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java Thu Jan 29 05:38:59 2015 @@ -17,8 +17,8 @@ package org.apache.nutch.plugin; /** - * <code>MissingDependencyException</code> will be thrown if a plugin - * dependency cannot be found. + * <code>MissingDependencyException</code> will be thrown if a plugin dependency + * cannot be found. * * @author Jérôme Charron */ Modified: nutch/trunk/src/java/org/apache/nutch/plugin/Pluggable.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/Pluggable.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/plugin/Pluggable.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/Pluggable.java Thu Jan 29 05:38:59 2015 @@ -17,16 +17,15 @@ package org.apache.nutch.plugin; /** - * Defines the capability of a class to be plugged into Nutch. - * This is a common interface that must be implemented by all - * Nutch Extension Points. - * + * Defines the capability of a class to be plugged into Nutch. This is a common + * interface that must be implemented by all Nutch Extension Points. + * * @author Jérôme Charron - * + * * @see <a href="http://wiki.apache.org/nutch/AboutPlugins">About Plugins</a> - * @see <a href="package-summary.html#package_description"> - * plugin package description</a> + * @see <a href="package-summary.html#package_description"> plugin package + * description</a> */ public interface Pluggable { - + } Modified: nutch/trunk/src/java/org/apache/nutch/plugin/Plugin.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/Plugin.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/plugin/Plugin.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/Plugin.java Thu Jan 29 05:38:59 2015 @@ -33,8 +33,8 @@ import org.apache.hadoop.conf.Configurat * The <code>Plugin</code> will be startuped and shutdown by the nutch plugin * management system. * - * A possible usecase of the <code>Plugin</code> implementation is to create - * or close a database connection. + * A possible usecase of the <code>Plugin</code> implementation is to create or + * close a database connection. * * @author joa23 */ @@ -81,7 +81,8 @@ public class Plugin { } /** - * @param descriptor The descriptor to set + * @param descriptor + * The descriptor to set */ private void setDescriptor(PluginDescriptor descriptor) { fDescriptor = descriptor; Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java Thu Jan 29 05:38:59 2015 @@ -45,11 +45,11 @@ public class PluginClassLoader extends U */ public PluginClassLoader(URL[] urls, ClassLoader parent) { super(urls, parent); - + this.urls = urls; this.parent = parent; } - + @Override public int hashCode() { final int PRIME = 31; Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java Thu Jan 29 05:38:59 2015 @@ -31,12 +31,11 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; /** - * The <code>PluginDescriptor</code> provide access to all meta information of - * a nutch-plugin, as well to the internationalizable resources and the plugin - * own classloader. There are meta information about <code>Plugin</code>, - * <code>ExtensionPoint</code> and <code>Extension</code>. To provide - * access to the meta data of a plugin via a descriptor allow a lazy loading - * mechanism. + * The <code>PluginDescriptor</code> provide access to all meta information of a + * nutch-plugin, as well to the internationalizable resources and the plugin own + * classloader. There are meta information about <code>Plugin</code>, + * <code>ExtensionPoint</code> and <code>Extension</code>. To provide access to + * the meta data of a plugin via a descriptor allow a lazy loading mechanism. */ public class PluginDescriptor { private String fPluginPath; @@ -52,7 +51,8 @@ public class PluginDescriptor { private ArrayList<URL> fNotExportedLibs = new ArrayList<URL>(); private ArrayList<Extension> fExtensions = new ArrayList<Extension>(); private PluginClassLoader fClassLoader; - public static final Logger LOG = LoggerFactory.getLogger(PluginDescriptor.class); + public static final Logger LOG = LoggerFactory + .getLogger(PluginDescriptor.class); private Configuration fConf; /** @@ -205,18 +205,19 @@ public class PluginDescriptor { /** * Adds a dependency * - * @param pId id of the dependent plugin + * @param pId + * id of the dependent plugin */ public void addDependency(String pId) { fDependencies.add(pId); } /** - * Adds a exported library with a relative path to the plugin directory. - * We automatically escape characters that are illegal in URLs. It is - * recommended that code converts an abstract pathname into a URL by - * first converting it into a URI, via the toURI method, and then - * converting the URI into a URL via the URI.toURL method. + * Adds a exported library with a relative path to the plugin directory. We + * automatically escape characters that are illegal in URLs. It is recommended + * that code converts an abstract pathname into a URL by first converting it + * into a URI, via the toURI method, and then converting the URI into a URL + * via the URI.toURL method. * * @param pLibPath */ @@ -246,11 +247,11 @@ public class PluginDescriptor { } /** - * Adds a exported library with a relative path to the plugin directory. - * We automatically escape characters that are illegal in URLs. It is - * recommended that code converts an abstract pathname into a URL by - * first converting it into a URI, via the toURI method, and then - * converting the URI into a URL via the URI.toURL method. + * Adds a exported library with a relative path to the plugin directory. We + * automatically escape characters that are illegal in URLs. It is recommended + * that code converts an abstract pathname into a URL by first converting it + * into a URI, via the toURI method, and then converting the URI into a URL + * via the URI.toURL method. * * @param pLibPath */ @@ -294,8 +295,8 @@ public class PluginDescriptor { LOG.debug(getPluginId() + " " + e.toString()); } URL[] urls = arrayList.toArray(new URL[arrayList.size()]); - fClassLoader = new PluginClassLoader(urls, PluginDescriptor.class - .getClassLoader()); + fClassLoader = new PluginClassLoader(urls, + PluginDescriptor.class.getClassLoader()); return fClassLoader; } @@ -317,7 +318,7 @@ public class PluginDescriptor { for (String id : pDescriptor.getDependencies()) { PluginDescriptor descriptor = PluginRepository.get(fConf) .getPluginDescriptor(id); - for (URL url: descriptor.getExportedLibUrls()) { + for (URL url : descriptor.getExportedLibUrls()) { pLibs.add(url); } collectLibs(pLibs, descriptor); Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java Thu Jan 29 05:38:59 2015 @@ -40,8 +40,8 @@ import org.w3c.dom.NodeList; import org.xml.sax.SAXException; /** - * The <code>PluginManifestParser</code> parser just parse the manifest file - * in all plugin directories. + * The <code>PluginManifestParser</code> parser just parse the manifest file in + * all plugin directories. * * @author joa23 */ @@ -94,7 +94,8 @@ public class PluginManifestParser { PluginDescriptor p = parseManifestFile(manifestPath); map.put(p.getPluginId(), p); } catch (Exception e) { - LOG.warn("Error while loading plugin `" + manifestPath + "` " + e.toString()); + LOG.warn("Error while loading plugin `" + manifestPath + "` " + + e.toString()); } } } @@ -183,7 +184,7 @@ public class PluginManifestParser { PluginDescriptor pluginDescriptor = new PluginDescriptor(id, version, name, providerName, pluginClazz, pPath, this.conf); LOG.debug("plugin: id=" + id + " name=" + name + " version=" + version - + " provider=" + providerName + "class=" + pluginClazz); + + " provider=" + providerName + "class=" + pluginClazz); parseExtension(rootElement, pluginDescriptor); parseExtensionPoints(rootElement, pluginDescriptor); parseLibraries(rootElement, pluginDescriptor); @@ -290,8 +291,8 @@ public class PluginManifestParser { if (parameters != null) { for (int k = 0; k < parameters.getLength(); k++) { Element param = (Element) parameters.item(k); - extension.addAttribute(param.getAttribute(ATTR_NAME), param - .getAttribute("value")); + extension.addAttribute(param.getAttribute(ATTR_NAME), + param.getAttribute("value")); } } pPluginDescriptor.addExtension(extension); Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java Thu Jan 29 05:38:59 2015 @@ -53,13 +53,13 @@ public class PluginRepository { private HashMap<String, ExtensionPoint> fExtensionPoints; private HashMap<String, Plugin> fActivatedPlugins; - - private static final Map<String, Map<PluginClassLoader, Class>> CLASS_CACHE = - new HashMap<String, Map<PluginClassLoader,Class>>(); + + private static final Map<String, Map<PluginClassLoader, Class>> CLASS_CACHE = new HashMap<String, Map<PluginClassLoader, Class>>(); private Configuration conf; - public static final Logger LOG = LoggerFactory.getLogger(PluginRepository.class); + public static final Logger LOG = LoggerFactory + .getLogger(PluginRepository.class); /** * @throws PluginRuntimeException @@ -71,7 +71,8 @@ public class PluginRepository { this.conf = new Configuration(conf); this.auto = conf.getBoolean("plugin.auto-activation", true); String[] pluginFolders = conf.getStrings("plugin.folders"); - PluginManifestParser manifestParser = new PluginManifestParser(this.conf, this); + PluginManifestParser manifestParser = new PluginManifestParser(this.conf, + this); Map<String, PluginDescriptor> allPlugins = manifestParser .parsePluginFolder(pluginFolders); if (allPlugins.isEmpty()) { @@ -88,7 +89,7 @@ public class PluginRepository { try { installExtensions(fRegisteredPlugins); } catch (PluginRuntimeException e) { - LOG.error(e.toString()); + LOG.error(e.toString()); throw new RuntimeException(e.getMessage()); } displayStatus(); @@ -115,8 +116,8 @@ public class PluginRepository { return; } - for (PluginDescriptor plugin: plugins) { - for(ExtensionPoint point:plugin.getExtenstionPoints()) { + for (PluginDescriptor plugin : plugins) { + for (ExtensionPoint point : plugin.getExtenstionPoints()) { String xpId = point.getId(); LOG.debug("Adding extension point " + xpId); fExtensionPoints.put(xpId, point); @@ -131,7 +132,7 @@ public class PluginRepository { throws PluginRuntimeException { for (PluginDescriptor descriptor : pRegisteredPlugins) { - for(Extension extension:descriptor.getExtensions()) { + for (Extension extension : descriptor.getExtensions()) { String xpId = extension.getTargetPoint(); ExtensionPoint point = getExtensionPoint(xpId); if (point == null) { @@ -159,7 +160,7 @@ public class PluginRepository { branch.put(plugin.getPluginId(), plugin); // Otherwise, checks each dependency - for(String id:plugin.getDependencies()) { + for (String id : plugin.getDependencies()) { PluginDescriptor dependency = plugins.get(id); if (dependency == null) { throw new MissingDependencyException("Missing dependency " + id @@ -274,7 +275,8 @@ public class PluginRepository { // The same is in Extension.getExtensionInstance(). // Suggested by Stefan Groschupf <[email protected]> synchronized (pDescriptor) { - Class<?> pluginClass = getCachedClass(pDescriptor, pDescriptor.getPluginClass()); + Class<?> pluginClass = getCachedClass(pDescriptor, + pDescriptor.getPluginClass()); Constructor<?> constructor = pluginClass.getConstructor(new Class<?>[] { PluginDescriptor.class, Configuration.class }); Plugin plugin = (Plugin) constructor.newInstance(new Object[] { @@ -315,9 +317,9 @@ public class PluginRepository { plugin.shutDown(); } } - + public Class getCachedClass(PluginDescriptor pDescriptor, String className) - throws ClassNotFoundException { + throws ClassNotFoundException { Map<PluginClassLoader, Class> descMap = CLASS_CACHE.get(className); if (descMap == null) { descMap = new HashMap<PluginClassLoader, Class>(); @@ -396,7 +398,7 @@ public class PluginRepository { } return map; } - + /** * Get ordered list of plugins. Filter and normalization plugins are applied * in a configurable "pipeline" order, e.g., if one plugin depends on the @@ -412,8 +414,8 @@ public class PluginRepository { * property name defining plugin order * @return array of plugin instances */ - public synchronized Object[] getOrderedPlugins(Class<?> clazz, String xPointId, - String orderProperty) { + public synchronized Object[] getOrderedPlugins(Class<?> clazz, + String xPointId, String orderProperty) { Object[] filters; ObjectCache objectCache = ObjectCache.get(conf); filters = (Object[]) objectCache.getObject(clazz.getName()); Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginRuntimeException.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRuntimeException.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/plugin/PluginRuntimeException.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginRuntimeException.java Thu Jan 29 05:38:59 2015 @@ -16,6 +16,7 @@ * limitations under the License. */ package org.apache.nutch.plugin; + /** * <code>PluginRuntimeException</code> will be thrown until a exception in the * plugin managemnt occurs. Modified: nutch/trunk/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original) +++ nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Thu Jan 29 05:38:59 2015 @@ -42,7 +42,7 @@ import org.apache.nutch.metadata.Metadat import org.apache.nutch.util.MimeUtil; import org.apache.nutch.util.NutchConfiguration; -public final class Content implements Writable{ +public final class Content implements Writable { public static final String DIR_NAME = "content"; @@ -121,11 +121,11 @@ public final class Content implements Wr metadata.readFields(in); // read meta data break; default: - throw new VersionMismatchException((byte)2, oldVersion); + throw new VersionMismatchException((byte) 2, oldVersion); } } - + public final void readFields(DataInput in) throws IOException { metadata.clear(); int sizeOrVersion = in.readInt(); @@ -143,14 +143,14 @@ public final class Content implements Wr metadata.readFields(in); break; default: - throw new VersionMismatchException((byte)VERSION, (byte)version); + throw new VersionMismatchException((byte) VERSION, (byte) version); } } else { // size byte[] compressed = new byte[sizeOrVersion]; in.readFully(compressed, 0, compressed.length); ByteArrayInputStream deflated = new ByteArrayInputStream(compressed); - DataInput inflater = - new DataInputStream(new InflaterInputStream(deflated)); + DataInput inflater = new DataInputStream( + new InflaterInputStream(deflated)); readFieldsCompressed(inflater); } } @@ -184,8 +184,9 @@ public final class Content implements Wr return url; } - /** The base url for relative links contained in the content. - * Maybe be different from url if the request redirected. + /** + * The base url for relative links contained in the content. Maybe be + * different from url if the request redirected. */ public String getBaseUrl() { return base; @@ -200,7 +201,9 @@ public final class Content implements Wr this.content = content; } - /** The media type of the retrieved content. + /** + * The media type of the retrieved content. + * * @see <a href="http://www.iana.org/assignments/media-types/"> * http://www.iana.org/assignments/media-types/</a> */ @@ -258,13 +261,12 @@ public final class Content implements Wr } Options opts = new Options(); Configuration conf = NutchConfiguration.create(); - - GenericOptionsParser parser = - new GenericOptionsParser(conf, opts, argv); - + + GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv); + String[] remainingArgs = parser.getRemainingArgs(); FileSystem fs = FileSystem.get(conf); - + try { int recno = Integer.parseInt(remainingArgs[0]); String segment = remainingArgs[1]; Modified: nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java (original) +++ nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java Thu Jan 29 05:38:59 2015 @@ -27,12 +27,11 @@ import org.apache.nutch.plugin.Pluggable import crawlercommons.robots.BaseRobotRules; - /** A retriever of url content. Implemented by protocol extensions. */ public interface Protocol extends Pluggable, Configurable { /** The name of the extension point. */ public final static String X_POINT_ID = Protocol.class.getName(); - + /** * Property name. If in the current configuration this property is set to * true, protocol implementations should handle "politeness" limits @@ -51,16 +50,19 @@ public interface Protocol extends Plugga */ public final static String CHECK_ROBOTS = "protocol.plugin.check.robots"; - /** Returns the {@link Content} for a fetchlist entry. + /** + * Returns the {@link Content} for a fetchlist entry. */ ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum); /** * Retrieve robot rules applicable for this url. - * @param url url to check - * @param datum page datum + * + * @param url + * url to check + * @param datum + * page datum * @return robot rules (specific for this url or default), never null */ BaseRobotRules getRobotRules(Text url, CrawlDatum datum); } - Modified: nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolFactory.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolFactory.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolFactory.java (original) +++ nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolFactory.java Thu Jan 29 05:38:59 2015 @@ -37,7 +37,8 @@ import org.apache.hadoop.conf.Configurat */ public class ProtocolFactory { - public static final Logger LOG = LoggerFactory.getLogger(ProtocolFactory.class); + public static final Logger LOG = LoggerFactory + .getLogger(ProtocolFactory.class); private ExtensionPoint extensionPoint; Modified: nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolNotFound.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolNotFound.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolNotFound.java (original) +++ nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolNotFound.java Thu Jan 29 05:38:59 2015 @@ -22,7 +22,7 @@ public class ProtocolNotFound extends Pr private String url; public ProtocolNotFound(String url) { - this(url, "protocol not found for url="+url); + this(url, "protocol not found for url=" + url); } public ProtocolNotFound(String url, String message) { @@ -30,5 +30,7 @@ public class ProtocolNotFound extends Pr this.url = url; } - public String getUrl() { return url; } + public String getUrl() { + return url; + } } Modified: nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolOutput.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolOutput.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolOutput.java (original) +++ nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolOutput.java Thu Jan 29 05:38:59 2015 @@ -18,8 +18,9 @@ package org.apache.nutch.protocol; /** - * Simple aggregate to pass from protocol plugins both content and - * protocol status. + * Simple aggregate to pass from protocol plugins both content and protocol + * status. + * * @author Andrzej Bialecki <[email protected]> */ public class ProtocolOutput { @@ -30,12 +31,12 @@ public class ProtocolOutput { this.content = content; this.status = status; } - + public ProtocolOutput(Content content) { this.content = content; this.status = ProtocolStatus.STATUS_SUCCESS; } - + public Content getContent() { return content; } Modified: nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original) +++ nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Thu Jan 29 05:38:59 2015 @@ -30,65 +30,76 @@ import org.apache.hadoop.io.WritableUtil * @author Andrzej Bialecki */ public class ProtocolStatus implements Writable { - + private final static byte VERSION = 2; - + /** Content was retrieved without errors. */ - public static final int SUCCESS = 1; + public static final int SUCCESS = 1; /** Content was not retrieved. Any further errors may be indicated in args. */ - public static final int FAILED = 2; - - /** This protocol was not found. Application may attempt to retry later. */ - public static final int PROTO_NOT_FOUND = 10; + public static final int FAILED = 2; + + /** This protocol was not found. Application may attempt to retry later. */ + public static final int PROTO_NOT_FOUND = 10; /** Resource is gone. */ - public static final int GONE = 11; + public static final int GONE = 11; /** Resource has moved permanently. New url should be found in args. */ - public static final int MOVED = 12; + public static final int MOVED = 12; /** Resource has moved temporarily. New url should be found in args. */ - public static final int TEMP_MOVED = 13; + public static final int TEMP_MOVED = 13; /** Resource was not found. */ - public static final int NOTFOUND = 14; + public static final int NOTFOUND = 14; /** Temporary failure. Application may retry immediately. */ - public static final int RETRY = 15; - /** Unspecified exception occured. Further information may be provided in args. */ - public static final int EXCEPTION = 16; + public static final int RETRY = 15; + /** + * Unspecified exception occured. Further information may be provided in args. + */ + public static final int EXCEPTION = 16; /** Access denied - authorization required, but missing/incorrect. */ - public static final int ACCESS_DENIED = 17; + public static final int ACCESS_DENIED = 17; /** Access denied by robots.txt rules. */ - public static final int ROBOTS_DENIED = 18; + public static final int ROBOTS_DENIED = 18; /** Too many redirects. */ - public static final int REDIR_EXCEEDED = 19; + public static final int REDIR_EXCEEDED = 19; /** Not fetching. */ - public static final int NOTFETCHING = 20; + public static final int NOTFETCHING = 20; /** Unchanged since the last fetch. */ - public static final int NOTMODIFIED = 21; - /** Request was refused by protocol plugins, because it would block. - * The expected number of milliseconds to wait before retry may be provided - * in args. */ - public static final int WOULDBLOCK = 22; + public static final int NOTMODIFIED = 21; + /** + * Request was refused by protocol plugins, because it would block. The + * expected number of milliseconds to wait before retry may be provided in + * args. + */ + public static final int WOULDBLOCK = 22; /** Thread was blocked http.max.delays times during fetching. */ - public static final int BLOCKED = 23; - + public static final int BLOCKED = 23; + // Useful static instances for status codes that don't usually require any // additional arguments. - public static final ProtocolStatus STATUS_SUCCESS = new ProtocolStatus(SUCCESS); + public static final ProtocolStatus STATUS_SUCCESS = new ProtocolStatus( + SUCCESS); public static final ProtocolStatus STATUS_FAILED = new ProtocolStatus(FAILED); public static final ProtocolStatus STATUS_GONE = new ProtocolStatus(GONE); - public static final ProtocolStatus STATUS_NOTFOUND = new ProtocolStatus(NOTFOUND); + public static final ProtocolStatus STATUS_NOTFOUND = new ProtocolStatus( + NOTFOUND); public static final ProtocolStatus STATUS_RETRY = new ProtocolStatus(RETRY); - public static final ProtocolStatus STATUS_ROBOTS_DENIED = new ProtocolStatus(ROBOTS_DENIED); - public static final ProtocolStatus STATUS_REDIR_EXCEEDED = new ProtocolStatus(REDIR_EXCEEDED); - public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(NOTFETCHING); - public static final ProtocolStatus STATUS_NOTMODIFIED = new ProtocolStatus(NOTMODIFIED); - public static final ProtocolStatus STATUS_WOULDBLOCK = new ProtocolStatus(WOULDBLOCK); - public static final ProtocolStatus STATUS_BLOCKED = new ProtocolStatus(BLOCKED); - + public static final ProtocolStatus STATUS_ROBOTS_DENIED = new ProtocolStatus( + ROBOTS_DENIED); + public static final ProtocolStatus STATUS_REDIR_EXCEEDED = new ProtocolStatus( + REDIR_EXCEEDED); + public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus( + NOTFETCHING); + public static final ProtocolStatus STATUS_NOTMODIFIED = new ProtocolStatus( + NOTMODIFIED); + public static final ProtocolStatus STATUS_WOULDBLOCK = new ProtocolStatus( + WOULDBLOCK); + public static final ProtocolStatus STATUS_BLOCKED = new ProtocolStatus( + BLOCKED); + private int code; private long lastModified; private String[] args; - - private static final HashMap<Integer, String> codeToName = - new HashMap<Integer, String>(); + + private static final HashMap<Integer, String> codeToName = new HashMap<Integer, String>(); static { codeToName.put(new Integer(SUCCESS), "success"); codeToName.put(new Integer(FAILED), "failed"); @@ -107,40 +118,41 @@ public class ProtocolStatus implements W codeToName.put(new Integer(WOULDBLOCK), "wouldblock"); codeToName.put(new Integer(BLOCKED), "blocked"); } - + public ProtocolStatus() { - + } public ProtocolStatus(int code, String[] args) { this.code = code; this.args = args; } - + public ProtocolStatus(int code, String[] args, long lastModified) { this.code = code; this.args = args; this.lastModified = lastModified; } - + public ProtocolStatus(int code) { this(code, null); } - + public ProtocolStatus(int code, long lastModified) { this(code, null, lastModified); } - + public ProtocolStatus(int code, Object message) { this(code, message, 0L); } - + public ProtocolStatus(int code, Object message, long lastModified) { this.code = code; this.lastModified = lastModified; - if (message != null) this.args = new String[]{String.valueOf(message)}; + if (message != null) + this.args = new String[] { String.valueOf(message) }; } - + public ProtocolStatus(Throwable t) { this(EXCEPTION, t); } @@ -150,10 +162,10 @@ public class ProtocolStatus implements W res.readFields(in); return res; } - + public void readFields(DataInput in) throws IOException { byte version = in.readByte(); - switch(version) { + switch (version) { case 1: code = in.readByte(); lastModified = in.readLong(); @@ -168,10 +180,10 @@ public class ProtocolStatus implements W throw new VersionMismatchException(VERSION, version); } } - + public void write(DataOutput out) throws IOException { out.writeByte(VERSION); - out.writeByte((byte)code); + out.writeByte((byte) code); out.writeLong(lastModified); if (args == null) { out.writeInt(-1); @@ -183,7 +195,7 @@ public class ProtocolStatus implements W public void setArgs(String[] args) { this.args = args; } - + public String[] getArgs() { return args; } @@ -195,74 +207,77 @@ public class ProtocolStatus implements W public String getName() { return codeToName.get(this.code); } - + public void setCode(int code) { this.code = code; } - + public boolean isSuccess() { - return code == SUCCESS; + return code == SUCCESS; } - + public boolean isTransientFailure() { - return - code == ACCESS_DENIED || - code == EXCEPTION || - code == REDIR_EXCEEDED || - code == RETRY || - code == TEMP_MOVED || - code == WOULDBLOCK || - code == PROTO_NOT_FOUND; + return code == ACCESS_DENIED || code == EXCEPTION || code == REDIR_EXCEEDED + || code == RETRY || code == TEMP_MOVED || code == WOULDBLOCK + || code == PROTO_NOT_FOUND; } - + public boolean isPermanentFailure() { - return - code == FAILED || - code == GONE || - code == MOVED || - code == NOTFOUND || - code == ROBOTS_DENIED; + return code == FAILED || code == GONE || code == MOVED || code == NOTFOUND + || code == ROBOTS_DENIED; } - + public String getMessage() { - if (args != null && args.length > 0) return args[0]; + if (args != null && args.length > 0) + return args[0]; return null; } - + public void setMessage(String msg) { - if (args != null && args.length > 0) args[0] = msg; - else args = new String[] {msg}; + if (args != null && args.length > 0) + args[0] = msg; + else + args = new String[] { msg }; } - + public long getLastModified() { return lastModified; } - + public void setLastModified(long lastModified) { this.lastModified = lastModified; } - + public boolean equals(Object o) { - if (o == null) return false; - if (!(o instanceof ProtocolStatus)) return false; - ProtocolStatus other = (ProtocolStatus)o; - if (this.code != other.code || this.lastModified != other.lastModified) return false; + if (o == null) + return false; + if (!(o instanceof ProtocolStatus)) + return false; + ProtocolStatus other = (ProtocolStatus) o; + if (this.code != other.code || this.lastModified != other.lastModified) + return false; if (this.args == null) { - if (other.args == null) return true; - else return false; + if (other.args == null) + return true; + else + return false; } else { - if (other.args == null) return false; - if (other.args.length != this.args.length) return false; + if (other.args == null) + return false; + if (other.args.length != this.args.length) + return false; for (int i = 0; i < this.args.length; i++) { - if (!this.args[i].equals(other.args[i])) return false; + if (!this.args[i].equals(other.args[i])) + return false; } } return true; } - + public String toString() { StringBuffer res = new StringBuffer(); - res.append(codeToName.get(new Integer(code)) + "(" + code + "), lastModified=" + lastModified); + res.append(codeToName.get(new Integer(code)) + "(" + code + + "), lastModified=" + lastModified); if (args != null) { if (args.length == 1) { res.append(": " + String.valueOf(args[0])); Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRules.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRules.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRules.java (original) +++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRules.java Thu Jan 29 05:38:59 2015 @@ -35,9 +35,8 @@ public interface RobotRules { public long getCrawlDelay(); /** - * Returns <code>false</code> if the <code>robots.txt</code> file - * prohibits us from accessing the given <code>url</code>, or - * <code>true</code> otherwise. + * Returns <code>false</code> if the <code>robots.txt</code> file prohibits us + * from accessing the given <code>url</code>, or <code>true</code> otherwise. */ public boolean isAllowed(URL url); Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java (original) +++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Thu Jan 29 05:38:59 2015 @@ -43,35 +43,38 @@ import crawlercommons.robots.SimpleRobot import crawlercommons.robots.SimpleRobotRulesParser; /** - * This class uses crawler-commons for handling the parsing of {@code robots.txt} files. - * It emits SimpleRobotRules objects, which describe the download permissions - * as described in SimpleRobotRulesParser. + * This class uses crawler-commons for handling the parsing of + * {@code robots.txt} files. It emits SimpleRobotRules objects, which describe + * the download permissions as described in SimpleRobotRulesParser. */ public abstract class RobotRulesParser implements Configurable { - public static final Logger LOG = LoggerFactory.getLogger(RobotRulesParser.class); + public static final Logger LOG = LoggerFactory + .getLogger(RobotRulesParser.class); - protected static final Hashtable<String, BaseRobotRules> CACHE = new Hashtable<String, BaseRobotRules> (); + protected static final Hashtable<String, BaseRobotRules> CACHE = new Hashtable<String, BaseRobotRules>(); /** - * A {@link BaseRobotRules} object appropriate for use - * when the {@code robots.txt} file is empty or missing; - * all requests are allowed. + * A {@link BaseRobotRules} object appropriate for use when the + * {@code robots.txt} file is empty or missing; all requests are allowed. */ - public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL); + public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules( + RobotRulesMode.ALLOW_ALL); /** - * A {@link BaseRobotRules} object appropriate for use when the - * {@code robots.txt} file is not fetched due to a {@code 403/Forbidden} - * response; all requests are disallowed. + * A {@link BaseRobotRules} object appropriate for use when the + * {@code robots.txt} file is not fetched due to a {@code 403/Forbidden} + * response; all requests are disallowed. */ - public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE); + public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules( + RobotRulesMode.ALLOW_NONE); private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser(); private Configuration conf; protected String agentNames; - public RobotRulesParser() { } + public RobotRulesParser() { + } public RobotRulesParser(Configuration conf) { setConf(conf); @@ -90,16 +93,18 @@ public abstract class RobotRulesParser i } agentNames = agentName; - // If there are any other agents specified, append those to the list of agents + // If there are any other agents specified, append those to the list of + // agents String otherAgents = conf.get("http.robots.agents"); - if(otherAgents != null && !otherAgents.trim().isEmpty()) { + if (otherAgents != null && !otherAgents.trim().isEmpty()) { StringTokenizer tok = new StringTokenizer(otherAgents, ","); StringBuilder sb = new StringBuilder(agentNames); while (tok.hasMoreTokens()) { String str = tok.nextToken().trim(); if (str.equals("*") || str.equals(agentName)) { // skip wildcard "*" or agent name itself - // (required for backward compatibility, cf. NUTCH-1715 and NUTCH-1718) + // (required for backward compatibility, cf. NUTCH-1715 and + // NUTCH-1718) } else { sb.append(",").append(str); } @@ -117,16 +122,23 @@ public abstract class RobotRulesParser i } /** - * Parses the robots content using the {@link SimpleRobotRulesParser} from crawler commons - * - * @param url A string containing url - * @param content Contents of the robots file in a byte array - * @param contentType The content type of the robots file - * @param robotName A string containing all the robots agent names used by parser for matching - * @return BaseRobotRules object + * Parses the robots content using the {@link SimpleRobotRulesParser} from + * crawler commons + * + * @param url + * A string containing url + * @param content + * Contents of the robots file in a byte array + * @param contentType + * The content type of the robots file + * @param robotName + * A string containing all the robots agent names used by parser for + * matching + * @return BaseRobotRules object */ - public BaseRobotRules parseRules (String url, byte[] content, String contentType, String robotName) { - return robotParser.parseContent(url, content, contentType, robotName); + public BaseRobotRules parseRules(String url, byte[] content, + String contentType, String robotName) { + return robotParser.parseContent(url, content, contentType, robotName); } public BaseRobotRules getRobotRulesSet(Protocol protocol, Text url) { @@ -145,23 +157,30 @@ public abstract class RobotRulesParser i public static void main(String[] argv) { if (argv.length != 3) { - System.err.println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n"); - System.err.println("\tThe <robots-file> will be parsed as a robots.txt file,"); - System.err.println("\tusing the given <agent-name> to select rules. URLs "); - System.err.println("\twill be read (one per line) from <url-file>, and tested"); - System.err.println("\tagainst the rules. Multiple agent names can be provided using"); + System.err + .println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n"); + System.err + .println("\tThe <robots-file> will be parsed as a robots.txt file,"); + System.err + .println("\tusing the given <agent-name> to select rules. URLs "); + System.err + .println("\twill be read (one per line) from <url-file>, and tested"); + System.err + .println("\tagainst the rules. Multiple agent names can be provided using"); System.err.println("\tcomma as a delimiter without any spaces."); System.exit(-1); } try { byte[] robotsBytes = Files.toByteArray(new File(argv[0])); - BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", argv[2]); + BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, + "text/plain", argv[2]); LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1])); String testPath = testsIn.readLine().trim(); while (testPath != null) { - System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not allowed") + ":\t" + testPath); + System.out.println((rules.isAllowed(testPath) ? "allowed" + : "not allowed") + ":\t" + testPath); testPath = testsIn.readLine(); } testsIn.close(); Modified: nutch/trunk/src/java/org/apache/nutch/protocol/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/protocol/package-info.java (original) +++ nutch/trunk/src/java/org/apache/nutch/protocol/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * see also {@link org.apache.nutch.net.protocols}. */ package org.apache.nutch.protocol; + Modified: nutch/trunk/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java Thu Jan 29 05:38:59 2015 @@ -17,52 +17,52 @@ import org.apache.nutch.scoring.ScoringF public abstract class AbstractScoringFilter implements ScoringFilter { - private Configuration conf; + private Configuration conf; - public Configuration getConf() { - return conf; - } - - public void setConf(Configuration conf) { - this.conf = conf; - } - - public void injectedScore(Text url, CrawlDatum datum) - throws ScoringFilterException { - } - - public void initialScore(Text url, CrawlDatum datum) - throws ScoringFilterException { - } - - public float generatorSortValue(Text url, CrawlDatum datum, float initSort) - throws ScoringFilterException { - return initSort; - } - - public void passScoreBeforeParsing(Text url, CrawlDatum datum, - Content content) throws ScoringFilterException { - } - - public void passScoreAfterParsing(Text url, Content content, Parse parse) - throws ScoringFilterException { - } - - public CrawlDatum distributeScoreToOutlinks(Text fromUrl, - ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, - CrawlDatum adjust, int allCount) throws ScoringFilterException { - return adjust; - } - - public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, - List<CrawlDatum> inlinked) throws ScoringFilterException { - } - - @Override - public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, - CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) - throws ScoringFilterException { - return initScore; - } + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public void injectedScore(Text url, CrawlDatum datum) + throws ScoringFilterException { + } + + public void initialScore(Text url, CrawlDatum datum) + throws ScoringFilterException { + } + + public float generatorSortValue(Text url, CrawlDatum datum, float initSort) + throws ScoringFilterException { + return initSort; + } + + public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) + throws ScoringFilterException { + } + + public void passScoreAfterParsing(Text url, Content content, Parse parse) + throws ScoringFilterException { + } + + public CrawlDatum distributeScoreToOutlinks(Text fromUrl, + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount) throws ScoringFilterException { + return adjust; + } + + public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, + List<CrawlDatum> inlinked) throws ScoringFilterException { + } + + @Override + public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, + CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) + throws ScoringFilterException { + return initScore; + } } Modified: nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java Thu Jan 29 05:38:59 2015 @@ -33,129 +33,181 @@ import org.apache.nutch.protocol.Content /** * A contract defining behavior of scoring plugins. * - * A scoring filter will manipulate scoring variables in CrawlDatum and - * in resulting search indexes. Filters can be chained in a specific order, - * to provide multi-stage scoring adjustments. + * A scoring filter will manipulate scoring variables in CrawlDatum and in + * resulting search indexes. Filters can be chained in a specific order, to + * provide multi-stage scoring adjustments. * * @author Andrzej Bialecki */ public interface ScoringFilter extends Configurable, Pluggable { /** The name of the extension point. */ public final static String X_POINT_ID = ScoringFilter.class.getName(); - + /** * Set an initial score for newly injected pages. Note: newly injected pages - * may have no inlinks, so filter implementations may wish to set this - * score to a non-zero value, to give newly injected pages some initial - * credit. - * @param url url of the page - * @param datum new datum. Filters will modify it in-place. + * may have no inlinks, so filter implementations may wish to set this score + * to a non-zero value, to give newly injected pages some initial credit. + * + * @param url + * url of the page + * @param datum + * new datum. Filters will modify it in-place. * @throws ScoringFilterException */ - public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException; - + public void injectedScore(Text url, CrawlDatum datum) + throws ScoringFilterException; + /** - * Set an initial score for newly discovered pages. Note: newly discovered pages - * have at least one inlink with its score contribution, so filter implementations - * may choose to set initial score to zero (unknown value), and then the inlink - * score contribution will set the "real" value of the new page. - * @param url url of the page - * @param datum new datum. Filters will modify it in-place. + * Set an initial score for newly discovered pages. Note: newly discovered + * pages have at least one inlink with its score contribution, so filter + * implementations may choose to set initial score to zero (unknown value), + * and then the inlink score contribution will set the "real" value of the new + * page. + * + * @param url + * url of the page + * @param datum + * new datum. Filters will modify it in-place. * @throws ScoringFilterException */ - public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException; - + public void initialScore(Text url, CrawlDatum datum) + throws ScoringFilterException; + /** - * This method prepares a sort value for the purpose of sorting and - * selecting top N scoring pages during fetchlist generation. - * @param url url of the page - * @param datum page's datum, should not be modified - * @param initSort initial sort value, or a value from previous filters in chain + * This method prepares a sort value for the purpose of sorting and selecting + * top N scoring pages during fetchlist generation. + * + * @param url + * url of the page + * @param datum + * page's datum, should not be modified + * @param initSort + * initial sort value, or a value from previous filters in chain */ - public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException; - + public float generatorSortValue(Text url, CrawlDatum datum, float initSort) + throws ScoringFilterException; + /** * This method takes all relevant score information from the current datum * (coming from a generated fetchlist) and stores it into - * {@link org.apache.nutch.protocol.Content} metadata. - * This is needed in order to pass this value(s) to the mechanism that distributes it - * to outlinked pages. - * @param url url of the page - * @param datum source datum. NOTE: modifications to this value are not persisted. - * @param content instance of content. Implementations may modify this - * in-place, primarily by setting some metadata properties. + * {@link org.apache.nutch.protocol.Content} metadata. This is needed in order + * to pass this value(s) to the mechanism that distributes it to outlinked + * pages. + * + * @param url + * url of the page + * @param datum + * source datum. NOTE: modifications to this value are not persisted. + * @param content + * instance of content. Implementations may modify this in-place, + * primarily by setting some metadata properties. */ - public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) throws ScoringFilterException; - + public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) + throws ScoringFilterException; + /** * Currently a part of score distribution is performed using only data coming * from the parsing process. We need this method in order to ensure the * presence of score data in these steps. - * @param url page url - * @param content original content. NOTE: modifications to this value are not persisted. - * @param parse target instance to copy the score information to. Implementations - * may modify this in-place, primarily by setting some metadata properties. + * + * @param url + * page url + * @param content + * original content. NOTE: modifications to this value are not + * persisted. + * @param parse + * target instance to copy the score information to. Implementations + * may modify this in-place, primarily by setting some metadata + * properties. */ - public void passScoreAfterParsing(Text url, Content content, Parse parse) throws ScoringFilterException; - + public void passScoreAfterParsing(Text url, Content content, Parse parse) + throws ScoringFilterException; + /** * Distribute score value from the current page to all its outlinked pages. - * @param fromUrl url of the source page - * @param parseData ParseData instance, which stores relevant score value(s) - * in its metadata. NOTE: filters may modify this in-place, all changes will - * be persisted. - * @param targets <url, CrawlDatum> pairs. NOTE: filters can modify this in-place, - * all changes will be persisted. - * @param adjust a CrawlDatum instance, initially null, which implementations - * may use to pass adjustment values to the original CrawlDatum. When creating - * this instance, set its status to {@link CrawlDatum#STATUS_LINKED}. - * @param allCount number of all collected outlinks from the source page + * + * @param fromUrl + * url of the source page + * @param parseData + * ParseData instance, which stores relevant score value(s) in its + * metadata. NOTE: filters may modify this in-place, all changes will + * be persisted. + * @param targets + * <url, CrawlDatum> pairs. NOTE: filters can modify this + * in-place, all changes will be persisted. + * @param adjust + * a CrawlDatum instance, initially null, which implementations may + * use to pass adjustment values to the original CrawlDatum. When + * creating this instance, set its status to + * {@link CrawlDatum#STATUS_LINKED}. + * @param allCount + * number of all collected outlinks from the source page * @return if needed, implementations may return an instance of CrawlDatum, - * with status {@link CrawlDatum#STATUS_LINKED}, which contains adjustments - * to be applied to the original CrawlDatum score(s) and metadata. This can - * be null if not needed. + * with status {@link CrawlDatum#STATUS_LINKED}, which contains + * adjustments to be applied to the original CrawlDatum score(s) and + * metadata. This can be null if not needed. * @throws ScoringFilterException */ - public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, - Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, - int allCount) throws ScoringFilterException; - - /** - * This method calculates a new score of CrawlDatum during CrawlDb update, based on the - * initial value of the original CrawlDatum, and also score values contributed by - * inlinked pages. - * @param url url of the page - * @param old original datum, with original score. May be null if this is a newly - * discovered page. If not null, filters should use score values from this parameter - * as the starting values - the <code>datum</code> parameter may contain values that are - * no longer valid, if other updates occured between generation and this update. - * @param datum the new datum, with the original score saved at the time when - * fetchlist was generated. Filters should update this in-place, and it will be saved in - * the crawldb. - * @param inlinked (partial) list of CrawlDatum-s (with their scores) from - * links pointing to this page, found in the current update batch. + public CrawlDatum distributeScoreToOutlinks(Text fromUrl, + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount) throws ScoringFilterException; + + /** + * This method calculates a new score of CrawlDatum during CrawlDb update, + * based on the initial value of the original CrawlDatum, and also score + * values contributed by inlinked pages. + * + * @param url + * url of the page + * @param old + * original datum, with original score. May be null if this is a + * newly discovered page. If not null, filters should use score + * values from this parameter as the starting values - the + * <code>datum</code> parameter may contain values that are no longer + * valid, if other updates occured between generation and this + * update. + * @param datum + * the new datum, with the original score saved at the time when + * fetchlist was generated. Filters should update this in-place, and + * it will be saved in the crawldb. + * @param inlinked + * (partial) list of CrawlDatum-s (with their scores) from links + * pointing to this page, found in the current update batch. * @throws ScoringFilterException */ - public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked) throws ScoringFilterException; - + public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, + List<CrawlDatum> inlinked) throws ScoringFilterException; + /** * This method calculates a Lucene document boost. - * @param url url of the page - * @param doc Lucene document. NOTE: this already contains all information collected - * by indexing filters. Implementations may modify this instance, in order to store/remove - * some information. - * @param dbDatum current page from CrawlDb. NOTE: changes made to this instance - * are not persisted. - * @param fetchDatum datum from FetcherOutput (containing among others the fetching status) - * @param parse parsing result. NOTE: changes made to this instance are not persisted. - * @param inlinks current inlinks from LinkDb. NOTE: changes made to this instance are - * not persisted. - * @param initScore initial boost value for the Lucene document. - * @return boost value for the Lucene document. This value is passed as an argument - * to the next scoring filter in chain. NOTE: implementations may also express - * other scoring strategies by modifying Lucene document directly. + * + * @param url + * url of the page + * @param doc + * Lucene document. NOTE: this already contains all information + * collected by indexing filters. Implementations may modify this + * instance, in order to store/remove some information. + * @param dbDatum + * current page from CrawlDb. NOTE: changes made to this instance are + * not persisted. + * @param fetchDatum + * datum from FetcherOutput (containing among others the fetching + * status) + * @param parse + * parsing result. NOTE: changes made to this instance are not + * persisted. + * @param inlinks + * current inlinks from LinkDb. NOTE: changes made to this instance + * are not persisted. + * @param initScore + * initial boost value for the Lucene document. + * @return boost value for the Lucene document. This value is passed as an + * argument to the next scoring filter in chain. NOTE: implementations + * may also express other scoring strategies by modifying Lucene + * document directly. * @throws ScoringFilterException */ public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, - CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException; + CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) + throws ScoringFilterException; }
