Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java Fri Jan 9 06:34:33 2015 @@ -22,44 +22,52 @@ public interface ParseStatusCodes { // Primary status codes: /** Parsing was not performed. */ - public static final byte NOTPARSED = 0; + public static final byte NOTPARSED = 0; /** Parsing succeeded. */ - public static final byte SUCCESS = 1; + public static final byte SUCCESS = 1; /** General failure. There may be a more specific error message in arguments. */ - public static final byte FAILED = 2; + public static final byte FAILED = 2; - public static final String[] majorCodes = { - "notparsed", - "success", - "failed" - }; + public static final String[] majorCodes = { "notparsed", "success", "failed" }; // Secondary success codes go here: - public static final short SUCCESS_OK = 0; + public static final short SUCCESS_OK = 0; - /** Parsed content contains a directive to redirect to another URL. - * The target URL can be retrieved from the arguments. + /** + * Parsed content contains a directive to redirect to another URL. The target + * URL can be retrieved from the arguments. */ - public static final short SUCCESS_REDIRECT = 100; + public static final short SUCCESS_REDIRECT = 100; // Secondary failure codes go here: - /** Parsing failed. An Exception occured (which may be retrieved from the arguments). */ - public static final short FAILED_EXCEPTION = 200; - /** Parsing failed. Content was truncated, but the parser cannot handle incomplete content. */ - public static final short FAILED_TRUNCATED = 202; - /** Parsing failed. 
Invalid format - the content may be corrupted or of wrong type. */ - public static final short FAILED_INVALID_FORMAT = 203; - /** Parsing failed. Other related parts of the content are needed to complete + /** + * Parsing failed. An Exception occurred (which may be retrieved from the + * arguments). + */ + public static final short FAILED_EXCEPTION = 200; + /** + * Parsing failed. Content was truncated, but the parser cannot handle + * incomplete content. + */ + public static final short FAILED_TRUNCATED = 202; + /** + * Parsing failed. Invalid format - the content may be corrupted or of wrong + * type. + */ + public static final short FAILED_INVALID_FORMAT = 203; + /** + * Parsing failed. Other related parts of the content are needed to complete * parsing. The list of URLs to missing parts may be provided in arguments. * The Fetcher may decide to fetch these parts at once, then put them into * Content.metadata, and supply them for re-parsing. */ - public static final short FAILED_MISSING_PARTS = 204; - /** Parsing failed. There was no content to be parsed - probably caused - * by errors at protocol stage. + public static final short FAILED_MISSING_PARTS = 204; + /** + * Parsing failed. There was no content to be parsed - probably caused by + * errors at protocol stage. */ - public static final short FAILED_MISSING_CONTENT = 205; - + public static final short FAILED_MISSING_CONTENT = 205; + }
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java Fri Jan 9 06:34:33 2015 @@ -29,10 +29,10 @@ import java.util.List; public class ParseStatusUtils { public static ParseStatus STATUS_SUCCESS = ParseStatus.newBuilder().build(); - public static final HashMap<Short,String> minorCodes = new HashMap<Short,String>(); + public static final HashMap<Short, String> minorCodes = new HashMap<Short, String>(); static { - STATUS_SUCCESS.setMajorCode((int)ParseStatusCodes.SUCCESS); + STATUS_SUCCESS.setMajorCode((int) ParseStatusCodes.SUCCESS); minorCodes.put(ParseStatusCodes.SUCCESS_OK, "ok"); minorCodes.put(ParseStatusCodes.SUCCESS_REDIRECT, "redirect"); minorCodes.put(ParseStatusCodes.FAILED_EXCEPTION, "exception"); @@ -49,8 +49,9 @@ public class ParseStatusUtils { return status.getMajorCode() == ParseStatusCodes.SUCCESS; } - /** A convenience method. Return a String representation of the first - * argument, or null. + /** + * A convenience method. Return a String representation of the first argument, + * or null. 
*/ public static String getMessage(ParseStatus status) { List<CharSequence> args = status.getArgs(); @@ -77,29 +78,30 @@ public class ParseStatusUtils { public static Parse getEmptyParse(Exception e, Configuration conf) { ParseStatus status = ParseStatus.newBuilder().build(); - status.setMajorCode((int)ParseStatusCodes.FAILED); - status.setMinorCode((int)ParseStatusCodes.FAILED_EXCEPTION); + status.setMajorCode((int) ParseStatusCodes.FAILED); + status.setMinorCode((int) ParseStatusCodes.FAILED_EXCEPTION); status.getArgs().add(new Utf8(e.toString())); return new Parse("", "", new Outlink[0], status); } - public static Parse getEmptyParse(int minorCode, String message, Configuration conf) { + public static Parse getEmptyParse(int minorCode, String message, + Configuration conf) { ParseStatus status = ParseStatus.newBuilder().build(); - status.setMajorCode((int)ParseStatusCodes.FAILED); + status.setMajorCode((int) ParseStatusCodes.FAILED); status.setMinorCode(minorCode); status.getArgs().add(new Utf8(message)); return new Parse("", "", new Outlink[0], status); } - + public static String toString(ParseStatus status) { if (status == null) { return "(null)"; } StringBuilder sb = new StringBuilder(); - sb.append(ParseStatusCodes.majorCodes[status.getMajorCode()] + - "/" + minorCodes.get(status.getMinorCode().shortValue())); + sb.append(ParseStatusCodes.majorCodes[status.getMajorCode()] + "/" + + minorCodes.get(status.getMinorCode().shortValue())); sb.append(" (" + status.getMajorCode() + "/" + status.getMinorCode() + ")"); sb.append(", args=["); List<CharSequence> args = status.getArgs(); @@ -107,7 +109,8 @@ public class ParseStatusUtils { int i = 0; Iterator<CharSequence> it = args.iterator(); while (it.hasNext()) { - if (i > 0) sb.append(','); + if (i > 0) + sb.append(','); sb.append(it.next()); i++; } Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Fri Jan 9 06:34:33 2015 @@ -49,7 +49,7 @@ import java.util.concurrent.TimeUnit; * A Utility class containing methods to simply perform parsing utilities such * as iterating through a preferred list of {@link Parser}s to obtain * {@link Parse} objects. - * + * * @author mattmann * @author Jérôme Charron * @author Sébastien Le Callonnec @@ -60,7 +60,7 @@ public class ParseUtil extends Configure public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class); private static final int DEFAULT_MAX_PARSE_TIME = 30; - + private Configuration conf; private Signature sig; private URLFilters filters; @@ -71,9 +71,9 @@ public class ParseUtil extends Configure /** Parser timeout set to 30 sec by default. Set -1 to deactivate **/ private int maxParseTime; private ExecutorService executorService; - + /** - * + * * @param conf */ public ParseUtil(Configuration conf) { @@ -90,15 +90,16 @@ public class ParseUtil extends Configure public void setConf(Configuration conf) { this.conf = conf; parserFactory = new ParserFactory(conf); - maxParseTime=conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME); + maxParseTime = conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME); sig = SignatureFactory.getSignature(conf); filters = new URLFilters(conf); normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK); int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100); - maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage; + maxOutlinks = (maxOutlinksPerPage < 0) ? 
Integer.MAX_VALUE + : maxOutlinksPerPage; ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false); executorService = Executors.newCachedThreadPool(new ThreadFactoryBuilder() - .setNameFormat("parse-%d").setDaemon(true).build()); + .setNameFormat("parse-%d").setDaemon(true).build()); } /** @@ -106,11 +107,13 @@ public class ParseUtil extends Configure * until a successful parse is performed and a {@link Parse} object is * returned. If the parse is unsuccessful, a message is logged to the * <code>WARNING</code> level, and an empty parse is returned. - * - * @throws ParserNotFound If there is no suitable parser found. - * @throws ParseException If there is an error parsing. + * + * @throws ParserNotFound + * If there is no suitable parser found. + * @throws ParseException + * If there is an error parsing. */ - public Parse parse(String url, WebPage page) throws ParserNotFound, + public Parse parse(String url, WebPage page) throws ParserNotFound, ParseException { Parser[] parsers = null; @@ -118,28 +121,29 @@ public class ParseUtil extends Configure parsers = this.parserFactory.getParsers(contentType, url); - for (int i=0; i<parsers.length; i++) { + for (int i = 0; i < parsers.length; i++) { if (LOG.isDebugEnabled()) { LOG.debug("Parsing [" + url + "] with [" + parsers[i] + "]"); } Parse parse = null; - - if (maxParseTime!=-1) - parse = runParser(parsers[i], url, page); - else - parse = parsers[i].getParse(url, page); - - if (parse!=null && ParseStatusUtils.isSuccess(parse.getParseStatus())) { + + if (maxParseTime != -1) + parse = runParser(parsers[i], url, page); + else + parse = parsers[i].getParse(url, page); + + if (parse != null && ParseStatusUtils.isSuccess(parse.getParseStatus())) { return parse; } } - LOG.warn("Unable to successfully parse content " + url + - " of type " + contentType); - return ParseStatusUtils.getEmptyParse(new ParseException("Unable to successfully parse content"), null); + LOG.warn("Unable to successfully parse content " + 
url + " of type " + + contentType); + return ParseStatusUtils.getEmptyParse(new ParseException( + "Unable to successfully parse content"), null); } - - private Parse runParser(Parser p, String url, WebPage page) { + + private Parse runParser(Parser p, String url, WebPage page) { ParseCallable pc = new ParseCallable(p, page, url); Future<Parse> task = executorService.submit(pc); Parse res = null; @@ -155,8 +159,9 @@ public class ParseUtil extends Configure } /** - * Parses given web page and stores parsed content within page. Puts - * a meta-redirect to outlinks. + * Parses given web page and stores parsed content within page. Puts a + * meta-redirect to outlinks. + * * @param key * @param page */ @@ -165,7 +170,8 @@ public class ParseUtil extends Configure byte status = page.getStatus().byteValue(); if (status != CrawlStatus.STATUS_FETCHED) { if (LOG.isDebugEnabled()) { - LOG.debug("Skipping " + url + " as status is: " + CrawlStatus.getName(status)); + LOG.debug("Skipping " + url + " as status is: " + + CrawlStatus.getName(status)); } return; } @@ -213,7 +219,8 @@ public class ParseUtil extends Configure return; } page.getOutlinks().put(new Utf8(newUrl), new Utf8()); - page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL); + page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED, + TableUtil.YES_VAL); if (newUrl == null || newUrl.equals(url)) { String reprUrl = URLUtil.chooseRepr(url, newUrl, refreshTime < FetcherJob.PERM_REFRESH_TIME); Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/Parser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/Parser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/Parser.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/Parser.java Fri Jan 9 06:34:33 2015 @@ -22,9 +22,10 @@ import 
org.apache.hadoop.conf.Configurab import org.apache.nutch.plugin.FieldPluggable; import org.apache.nutch.storage.WebPage; -/** A parser for content generated by a {@link org.apache.nutch.protocol.Protocol} - * implementation. This interface is implemented by extensions. Nutch's core - * contains no page parsing code. +/** + * A parser for content generated by a + * {@link org.apache.nutch.protocol.Protocol} implementation. This interface is + * implemented by extensions. Nutch's core contains no page parsing code. */ public interface Parser extends FieldPluggable, Configurable { /** The name of the extension point. */ @@ -34,8 +35,9 @@ public interface Parser extends FieldPlu * <p> * This method parses content in WebPage instance * </p> - * - * @param url Page's URL + * + * @param url + * Page's URL * @param page */ Parse getParse(String url, WebPage page); Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Fri Jan 9 06:34:33 2015 @@ -37,28 +37,30 @@ import java.util.Map; import java.util.Map.Entry; /** - * Parser checker, useful for testing parser. - * It also accurately reports possible fetching and - * parsing failures and presents protocol status signals to aid - * debugging. The tool enables us to retrieve the following data from - * any url: + * Parser checker, useful for testing parser. It also accurately reports + * possible fetching and parsing failures and presents protocol status signals + * to aid debugging. 
The tool enables us to retrieve the following data from any + * url: * <ol> - * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content} type.</li> - * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) and is used to remove - * duplicates during the dedup procedure. - * It is calculated using {@link org.apache.nutch.crawl.MD5Signature} or + * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content} + * type.</li> + * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) and + * is used to remove duplicates during the dedup procedure. It is calculated + * using {@link org.apache.nutch.crawl.MD5Signature} or * {@link org.apache.nutch.crawl.TextProfileSignature}.</li> * <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li> * <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li> * <li><tt>Title</tt>: of the URL</li> * <li><tt>Outlinks</tt>: associated with the URL</li> * <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>, - * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>, <i>Cache-Control</>, etc.</li> + * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>, + * <i>Cache-Control</i>, etc.</li> * <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>, * <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li> - * <li><tt>ParseText</tt>: The page parse text which varies in length depdnecing on - * <code>content.length</code> configuration.</li> + * <li><tt>ParseText</tt>: The page parse text which varies in length depending + * on <code>content.length</code> configuration.</li> * </ol> + * * @author John Xing */ @@ -107,7 +109,7 @@ public class ParserChecker implements To ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page); - if(!protocolOutput.getStatus().isSuccess()) { + if (!protocolOutput.getStatus().isSuccess()) { LOG.error("Fetch failed with protocol status: " +
ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode()) + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus())); @@ -155,7 +157,6 @@ public class ParserChecker implements To LOG.info("signature: " + StringUtil.toHexString(signature)); } - LOG.info("---------\nUrl\n---------------\n"); System.out.print(url + "\n"); LOG.info("---------\nMetadata\n---------\n"); @@ -167,7 +168,7 @@ public class ParserChecker implements To while (iterator.hasNext()) { Entry<CharSequence, ByteBuffer> entry = iterator.next(); sb.append(entry.getKey().toString()).append(" : \t") - .append(Bytes.toString(entry.getValue())).append("\n"); + .append(Bytes.toString(entry.getValue())).append("\n"); } System.out.print(sb.toString()); } @@ -182,12 +183,12 @@ public class ParserChecker implements To Map<CharSequence, CharSequence> headers = page.getHeaders(); StringBuffer headersb = new StringBuffer(); if (metadata != null) { - Iterator<Entry<CharSequence, CharSequence>> iterator = headers.entrySet() - .iterator(); + Iterator<Entry<CharSequence, CharSequence>> iterator = headers + .entrySet().iterator(); while (iterator.hasNext()) { Entry<CharSequence, CharSequence> entry = iterator.next(); headersb.append(entry.getKey().toString()).append(" : \t") - .append(entry.getValue()).append("\n"); + .append(entry.getValue()).append("\n"); } System.out.print(headersb.toString()); } Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserFactory.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserFactory.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserFactory.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserFactory.java Fri Jan 9 06:34:33 2015 @@ -34,8 +34,7 @@ import org.apache.nutch.storage.WebPage; import org.apache.nutch.util.MimeUtil; import 
org.apache.nutch.util.ObjectCache; - -/** Creates and caches {@link Parser} plugins.*/ +/** Creates and caches {@link Parser} plugins. */ public final class ParserFactory { public static final Logger LOG = LoggerFactory.getLogger(ParserFactory.class); @@ -44,8 +43,7 @@ public final class ParserFactory { public static final String DEFAULT_PLUGIN = "*"; /** Empty extension list for caching purposes. */ - private final List<Extension> EMPTY_EXTENSION_LIST = - new ArrayList<Extension>(); + private final List<Extension> EMPTY_EXTENSION_LIST = new ArrayList<Extension>(); private final Configuration conf; private final ExtensionPoint extensionPoint; @@ -56,10 +54,12 @@ public final class ParserFactory { ObjectCache objectCache = ObjectCache.get(conf); this.extensionPoint = PluginRepository.get(conf).getExtensionPoint( Parser.X_POINT_ID); - this.parsePluginList = (ParsePluginList)objectCache.getObject(ParsePluginList.class.getName()); + this.parsePluginList = (ParsePluginList) objectCache + .getObject(ParsePluginList.class.getName()); if (this.parsePluginList == null) { this.parsePluginList = new ParsePluginsReader().parse(conf); - objectCache.setObject(ParsePluginList.class.getName(), this.parsePluginList); + objectCache.setObject(ParsePluginList.class.getName(), + this.parsePluginList); } if (this.extensionPoint == null) { @@ -71,33 +71,34 @@ public final class ParserFactory { } } - /** * Function returns an array of {@link Parser}s for a given content type. - * + * * The function consults the internal list of parse plugins for the - * ParserFactory to determine the list of pluginIds, then gets the - * appropriate extension points to instantiate as {@link Parser}s. - * - * @param contentType The contentType to return the <code>Array</code> - * of {@link Parser}s for. - * @param url The url for the content that may allow us to get the type from - * the file suffix. 
+ * ParserFactory to determine the list of pluginIds, then gets the appropriate + * extension points to instantiate as {@link Parser}s. + * + * @param contentType + * The contentType to return the <code>Array</code> of {@link Parser} + * s for. + * @param url + * The url for the content that may allow us to get the type from the + * file suffix. * @return An <code>Array</code> of {@link Parser}s for the given contentType. * If there were plugins mapped to a contentType via the - * <code>parse-plugins.xml</code> file, but never enabled via - * the <code>plugin.includes</code> Nutch conf, then those plugins - * won't be part of this array, i.e., they will be skipped. - * So, if the ordered list of parsing plugins for - * <code>text/plain</code> was <code>[parse-text,parse-html, + * <code>parse-plugins.xml</code> file, but never enabled via the + * <code>plugin.includes</code> Nutch conf, then those plugins won't + * be part of this array, i.e., they will be skipped. So, if the + * ordered list of parsing plugins for <code>text/plain</code> was + * <code>[parse-text,parse-html, * parse-rtf]</code>, and only <code>parse-html</code> and * <code>parse-rtf</code> were enabled via - * <code>plugin.includes</code>, then this ordered Array would - * consist of two {@link Parser} interfaces, + * <code>plugin.includes</code>, then this ordered Array would consist + * of two {@link Parser} interfaces, * <code>[parse-html, parse-rtf]</code>. */ public Parser[] getParsers(String contentType, String url) - throws ParserNotFound { + throws ParserNotFound { List<Parser> parsers = null; List<Extension> parserExts = null; @@ -107,7 +108,7 @@ public final class ParserFactory { // TODO once the MimeTypes is available // parsers = getExtensions(MimeUtils.map(contentType)); // if (parsers != null) { - // return parsers; + // return parsers; // } // Last Chance: Guess content-type from file url... 
// parsers = getExtensions(MimeUtils.getMimeType(url)); @@ -118,49 +119,50 @@ public final class ParserFactory { } parsers = new ArrayList<Parser>(parserExts.size()); - for (Extension ext : parserExts){ + for (Extension ext : parserExts) { Parser p = null; try { - //check to see if we've cached this parser instance yet + // check to see if we've cached this parser instance yet p = (Parser) objectCache.getObject(ext.getId()); if (p == null) { // go ahead and instantiate it and then cache it p = (Parser) ext.getExtensionInstance(); - objectCache.setObject(ext.getId(),p); + objectCache.setObject(ext.getId(), p); } parsers.add(p); } catch (PluginRuntimeException e) { if (LOG.isWarnEnabled()) { LOG.warn("ParserFactory:PluginRuntimeException when " - + "initializing parser plugin " - + ext.getDescriptor().getPluginId() - + " instance in getParsers " - + "function: attempting to continue instantiating parsers: ", e); + + "initializing parser plugin " + + ext.getDescriptor().getPluginId() + " instance in getParsers " + + "function: attempting to continue instantiating parsers: ", e); } } } - return parsers.toArray(new Parser[]{}); + return parsers.toArray(new Parser[] {}); } /** * Function returns a {@link Parser} instance with the specified - * <code>extId</code>, representing its extension ID. If the Parser - * instance isn't found, then the function throws a - * <code>ParserNotFound</code> exception. If the function is able to find - * the {@link Parser} in the internal <code>PARSER_CACHE</code> then it - * will return the already instantiated Parser. Otherwise, if it has to - * instantiate the Parser itself , then this function will cache that Parser - * in the internal <code>PARSER_CACHE</code>. - * - * @param id The string extension ID (e.g., - * "org.apache.nutch.parse.rss.RSSParser", - * "org.apache.nutch.parse.rtf.RTFParseFactory") of the {@link Parser} - * implementation to return. + * <code>extId</code>, representing its extension ID. 
If the Parser instance + * isn't found, then the function throws a <code>ParserNotFound</code> + * exception. If the function is able to find the {@link Parser} in the + * internal <code>PARSER_CACHE</code> then it will return the already + * instantiated Parser. Otherwise, if it has to instantiate the Parser itself, + * then this function will cache that Parser in the internal + * <code>PARSER_CACHE</code>. + * + * @param id + * The string extension ID (e.g., + * "org.apache.nutch.parse.rss.RSSParser", + * "org.apache.nutch.parse.rtf.RTFParseFactory") of the + * {@link Parser} implementation to return. * @return A {@link Parser} implementation specified by the parameter * <code>id</code>. - * @throws ParserNotFound If the Parser is not found (i.e., registered with - * the extension point), or if the there a - * {@link PluginRuntimeException} instantiating the {@link Parser}. + * @throws ParserNotFound + * If the Parser is not found (i.e., not registered with the extension + * point), or if there is a {@link PluginRuntimeException} + * instantiating the {@link Parser}.
*/ public Parser getParserById(String id) throws ParserNotFound { @@ -184,7 +186,7 @@ public final class ParserFactory { if (objectCache.getObject(parserExt.getId()) != null) { return (Parser) objectCache.getObject(parserExt.getId()); - // if not found in cache, instantiate the Parser + // if not found in cache, instantiate the Parser } else { try { Parser p = (Parser) parserExt.getExtensionInstance(); @@ -192,9 +194,9 @@ public final class ParserFactory { return p; } catch (PluginRuntimeException e) { if (LOG.isWarnEnabled()) { - LOG.warn("Canno initialize parser " + - parserExt.getDescriptor().getPluginId() + - " (cause: " + e.toString()); + LOG.warn("Canno initialize parser " + + parserExt.getDescriptor().getPluginId() + " (cause: " + + e.toString()); } throw new ParserNotFound("Cannot init parser for id [" + id + "]"); } @@ -212,7 +214,7 @@ public final class ParserFactory { columns.addAll(pluginFields); } } catch (PluginRuntimeException e) { - LOG.error("PluginRuntimeException",e); + LOG.error("PluginRuntimeException", e); } } return columns; @@ -220,10 +222,11 @@ public final class ParserFactory { /** * Finds the best-suited parse plugin for a given contentType. - * - * @param contentType Content-Type for which we seek a parse plugin. - * @return a list of extensions to be used for this contentType. - * If none, returns <code>null</code>. + * + * @param contentType + * Content-Type for which we seek a parse plugin. + * @return a list of extensions to be used for this contentType. If none, + * returns <code>null</code>. */ @SuppressWarnings("unchecked") protected List<Extension> getExtensions(String contentType) { @@ -246,8 +249,8 @@ public final class ParserFactory { if (extensions != null) { objectCache.setObject(type, extensions); } else { - // Put the empty extension list into cache - // to remember we don't know any related extension. + // Put the empty extension list into cache + // to remember we don't know any related extension. 
objectCache.setObject(type, EMPTY_EXTENSION_LIST); } } @@ -256,22 +259,24 @@ public final class ParserFactory { /** * searches a list of suitable parse plugins for the given contentType. - * <p>It first looks for a preferred plugin defined in the parse-plugin - * file. If none is found, it returns a list of default plugins. - * - * @param contentType Content-Type for which we seek a parse plugin. - * @return List - List of extensions to be used for this contentType. - * If none, returns null. + * <p> + * It first looks for a preferred plugin defined in the parse-plugin file. If + * none is found, it returns a list of default plugins. + * + * @param contentType + * Content-Type for which we seek a parse plugin. + * @return List - List of extensions to be used for this contentType. If none, + * returns null. */ private List<Extension> findExtensions(String contentType) { Extension[] extensions = this.extensionPoint.getExtensions(); // Look for a preferred plugin. - List<String> parsePluginList = - this.parsePluginList.getPluginList(contentType); - List<Extension> extensionList = - matchExtensions(parsePluginList, extensions, contentType); + List<String> parsePluginList = this.parsePluginList + .getPluginList(contentType); + List<Extension> extensionList = matchExtensions(parsePluginList, + extensions, contentType); if (extensionList != null) { return extensionList; } @@ -284,20 +289,23 @@ public final class ParserFactory { /** * Tries to find a suitable parser for the given contentType. 
* <ol> - * <li>It checks if a parser which accepts the contentType - * can be found in the <code>plugins</code> list;</li> - * <li>If this list is empty, it tries to find amongst the loaded - * extensions whether some of them might suit and warns the user.</li> + * <li>It checks if a parser which accepts the contentType can be found in the + * <code>plugins</code> list;</li> + * <li>If this list is empty, it tries to find amongst the loaded extensions + * whether some of them might suit and warns the user.</li> * </ol> - * @param plugins List of candidate plugins. - * @param extensions Array of loaded extensions. - * @param contentType Content-Type for which we seek a parse plugin. - * @return List - List of extensions to be used for this contentType. - * If none, returns null. + * + * @param plugins + * List of candidate plugins. + * @param extensions + * Array of loaded extensions. + * @param contentType + * Content-Type for which we seek a parse plugin. + * @return List - List of extensions to be used for this contentType. If none, + * returns null. 
*/ private List<Extension> matchExtensions(List<String> plugins, - Extension[] extensions, - String contentType) { + Extension[] extensions, String contentType) { List<Extension> extList = new ArrayList<Extension>(); if (plugins != null) { @@ -315,7 +323,7 @@ public final class ParserFactory { // in either case, LOG the appropriate error message to WARN level if (ext == null) { - //try to get it just by its pluginId + // try to get it just by its pluginId ext = getExtension(extensions, parsePluginId); if (LOG.isWarnEnabled()) { @@ -323,17 +331,17 @@ public final class ParserFactory { // plugin was enabled via plugin.includes // its plugin.xml just doesn't claim to support that // particular mimeType - LOG.warn("ParserFactory:Plugin: " + parsePluginId + - " mapped to contentType " + contentType + - " via parse-plugins.xml, but " + "its plugin.xml " + - "file does not claim to support contentType: " + - contentType); + LOG.warn("ParserFactory:Plugin: " + parsePluginId + + " mapped to contentType " + contentType + + " via parse-plugins.xml, but " + "its plugin.xml " + + "file does not claim to support contentType: " + + contentType); } else { // plugin wasn't enabled via plugin.includes - LOG.warn("ParserFactory: Plugin: " + parsePluginId + - " mapped to contentType " + contentType + - " via parse-plugins.xml, but not enabled via " + - "plugin.includes in nutch-default.xml"); + LOG.warn("ParserFactory: Plugin: " + parsePluginId + + " mapped to contentType " + contentType + + " via parse-plugins.xml, but not enabled via " + + "plugin.includes in nutch-default.xml"); } } } @@ -353,12 +361,12 @@ public final class ParserFactory { // any extensions where this is the case, throw a // NotMappedParserException - for (int i=0; i<extensions.length; i++) { - if ("*".equals(extensions[i].getAttribute("contentType"))){ + for (int i = 0; i < extensions.length; i++) { + if ("*".equals(extensions[i].getAttribute("contentType"))) { extList.add(0, extensions[i]); - } - else if 
(extensions[i].getAttribute("contentType") != null - && contentType.matches(escapeContentType(extensions[i].getAttribute("contentType")))) { + } else if (extensions[i].getAttribute("contentType") != null + && contentType.matches(escapeContentType(extensions[i] + .getAttribute("contentType")))) { extList.add(extensions[i]); } } @@ -367,21 +375,23 @@ public final class ParserFactory { if (LOG.isInfoEnabled()) { StringBuffer extensionsIDs = new StringBuffer("["); boolean isFirst = true; - for (Extension ext : extList){ - if (!isFirst) extensionsIDs.append(" - "); - else isFirst=false; - extensionsIDs.append(ext.getId()); + for (Extension ext : extList) { + if (!isFirst) + extensionsIDs.append(" - "); + else + isFirst = false; + extensionsIDs.append(ext.getId()); } - extensionsIDs.append("]"); - LOG.info("The parsing plugins: " + extensionsIDs.toString() + - " are enabled via the plugin.includes system " + - "property, and all claim to support the content type " + - contentType + ", but they are not mapped to it in the " + - "parse-plugins.xml file"); + extensionsIDs.append("]"); + LOG.info("The parsing plugins: " + extensionsIDs.toString() + + " are enabled via the plugin.includes system " + + "property, and all claim to support the content type " + + contentType + ", but they are not mapped to it in the " + + "parse-plugins.xml file"); } } else if (LOG.isDebugEnabled()) { - LOG.debug("ParserFactory:No parse plugins mapped or enabled for " + - "contentType " + contentType); + LOG.debug("ParserFactory:No parse plugins mapped or enabled for " + + "contentType " + contentType); } } @@ -389,23 +399,22 @@ public final class ParserFactory { } private String escapeContentType(String contentType) { - // Escapes contentType in order to use as a regex - // (and keep backwards compatibility). - // This enables to accept multiple types for a single parser. 
- return contentType.replace("+", "\\+").replace(".", "\\."); - } - + // Escapes contentType in order to use as a regex + // (and keep backwards compatibility). + // This enables to accept multiple types for a single parser. + return contentType.replace("+", "\\+").replace(".", "\\."); + } - private boolean match(Extension extension, String id, String type) { - return (id.equals(extension.getId())) && - (extension.getAttribute("contentType").equals("*") || - type.matches(escapeContentType(extension.getAttribute("contentType"))) || - type.equals(DEFAULT_PLUGIN)); + private boolean match(Extension extension, String id, String type) { + return (id.equals(extension.getId())) + && (extension.getAttribute("contentType").equals("*") + || type.matches(escapeContentType(extension + .getAttribute("contentType"))) || type.equals(DEFAULT_PLUGIN)); } /** Get an extension from its id and supported content-type. */ private Extension getExtension(Extension[] list, String id, String type) { - for (int i=0; i<list.length; i++) { + for (int i = 0; i < list.length; i++) { if (match(list[i], id, type)) { return list[i]; } @@ -414,7 +423,7 @@ public final class ParserFactory { } private Extension getExtension(Extension[] list, String id) { - for (int i=0; i<list.length; i++) { + for (int i = 0; i < list.length; i++) { if (id.equals(list[i].getId())) { return list[i]; } Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java Fri Jan 9 06:34:33 2015 @@ -58,9 +58,9 @@ public class ParserJob extends NutchTool private static final String RESUME_KEY = "parse.job.resume"; private static final String 
FORCE_KEY = "parse.job.force"; - + public static final String SKIP_TRUNCATED = "parser.skip.truncated"; - + private static final Utf8 REPARSE = new Utf8("-reparse"); private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); @@ -79,9 +79,8 @@ public class ParserJob extends NutchTool FIELDS.add(WebPage.Field.HEADERS); } - - public static class ParserMapper - extends GoraMapper<String, WebPage, String, WebPage> { + public static class ParserMapper extends + GoraMapper<String, WebPage, String, WebPage> { private ParseUtil parseUtil; private boolean shouldResume; @@ -91,15 +90,16 @@ public class ParserJob extends NutchTool private Utf8 batchId; private boolean skipTruncated; - + @Override public void setup(Context context) throws IOException { Configuration conf = context.getConfiguration(); parseUtil = new ParseUtil(conf); shouldResume = conf.getBoolean(RESUME_KEY, false); force = conf.getBoolean(FORCE_KEY, false); - batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR)); - skipTruncated=conf.getBoolean(SKIP_TRUNCATED, true); + batchId = new Utf8( + conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR)); + skipTruncated = conf.getBoolean(SKIP_TRUNCATED, true); } @Override @@ -131,7 +131,6 @@ public class ParserJob extends NutchTool if (skipTruncated && isTruncated(unreverseKey, page)) { return; } - parseUtil.process(key, page); ParseStatus pstatus = page.getParseStatus(); @@ -141,9 +140,9 @@ public class ParserJob extends NutchTool } context.write(key, page); - } + } } - + public ParserJob() { } @@ -151,20 +150,22 @@ public class ParserJob extends NutchTool public ParserJob(Configuration conf) { setConf(conf); } - + /** * Checks if the page's content is truncated. - * @param url + * + * @param url * @param page - * @return If the page is truncated <code>true</code>. When it is not, - * or when it could be determined, <code>false</code>. + * @return If the page is truncated <code>true</code>. 
When it is not, or when + * it could be determined, <code>false</code>. */ public static boolean isTruncated(String url, WebPage page) { ByteBuffer content = page.getContent(); if (content == null) { return false; } - CharSequence lengthUtf8 = page.getHeaders().get(new Utf8(HttpHeaders.CONTENT_LENGTH)); + CharSequence lengthUtf8 = page.getHeaders().get( + new Utf8(HttpHeaders.CONTENT_LENGTH)); if (lengthUtf8 == null) { return false; } @@ -186,7 +187,8 @@ public class ParserJob extends NutchTool return true; } if (LOG.isDebugEnabled()) { - LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize); + LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + + inHeaderSize); } return false; } @@ -198,8 +200,8 @@ public class ParserJob extends NutchTool ParseFilters parseFilters = new ParseFilters(conf); Collection<WebPage.Field> parsePluginFields = parserFactory.getFields(); - Collection<WebPage.Field> signaturePluginFields = - SignatureFactory.getFields(conf); + Collection<WebPage.Field> signaturePluginFields = SignatureFactory + .getFields(conf); Collection<WebPage.Field> htmlParsePluginFields = parseFilters.getFields(); if (parsePluginFields != null) { @@ -226,11 +228,11 @@ public class ParserJob extends NutchTool } @Override - public Map<String,Object> run(Map<String,Object> args) throws Exception { - String batchId = (String)args.get(Nutch.ARG_BATCH); - Boolean shouldResume = (Boolean)args.get(Nutch.ARG_RESUME); - Boolean force = (Boolean)args.get(Nutch.ARG_FORCE); - + public Map<String, Object> run(Map<String, Object> args) throws Exception { + String batchId = (String) args.get(Nutch.ARG_BATCH); + Boolean shouldResume = (Boolean) args.get(Nutch.ARG_RESUME); + Boolean force = (Boolean) args.get(Nutch.ARG_FORCE); + if (batchId != null) { getConf().set(GeneratorJob.BATCH_ID, batchId); } @@ -241,17 +243,18 @@ public class ParserJob extends NutchTool getConf().setBoolean(FORCE_KEY, force); } LOG.info("ParserJob: resuming:\t" + 
getConf().getBoolean(RESUME_KEY, false)); - LOG.info("ParserJob: forced reparse:\t" + getConf().getBoolean(FORCE_KEY, false)); + LOG.info("ParserJob: forced reparse:\t" + + getConf().getBoolean(FORCE_KEY, false)); if (batchId == null || batchId.equals(Nutch.ALL_BATCH_ID_STR)) { LOG.info("ParserJob: parsing all"); } else { LOG.info("ParserJob: batchId:\t" + batchId); } currentJob = new NutchJob(getConf(), "parse"); - + Collection<WebPage.Field> fields = getFields(currentJob); MapFieldValueFilter<String, WebPage> batchIdFilter = getBatchIdFilter(batchId); - StorageUtils.initMapperJob(currentJob, fields, String.class, WebPage.class, + StorageUtils.initMapperJob(currentJob, fields, String.class, WebPage.class, ParserMapper.class, batchIdFilter); StorageUtils.initReducerJob(currentJob, IdentityPageReducer.class); currentJob.setNumReduceTasks(0); @@ -275,20 +278,20 @@ public class ParserJob extends NutchTool return filter; } - public int parse(String batchId, boolean shouldResume, boolean force) throws Exception { - + public int parse(String batchId, boolean shouldResume, boolean force) + throws Exception { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("ParserJob: starting at " + sdf.format(start)); - run(ToolUtil.toArgMap( - Nutch.ARG_BATCH, batchId, - Nutch.ARG_RESUME, shouldResume, - Nutch.ARG_FORCE, force)); + run(ToolUtil.toArgMap(Nutch.ARG_BATCH, batchId, Nutch.ARG_RESUME, + shouldResume, Nutch.ARG_FORCE, force)); LOG.info("ParserJob: success"); - + long finish = System.currentTimeMillis(); - LOG.info("ParserJob: finished at " + sdf.format(finish) + ", time elapsed: " + TimingUtil.elapsedTime(start, finish)); + LOG.info("ParserJob: finished at " + sdf.format(finish) + + ", time elapsed: " + TimingUtil.elapsedTime(start, finish)); return 0; } @@ -298,12 +301,18 @@ public class ParserJob extends NutchTool String batchId = null; if (args.length < 1) { - System.err.println("Usage: ParserJob 
(<batchId> | -all) [-crawlId <id>] [-resume] [-force]"); - System.err.println(" <batchId> - symbolic batch ID created by Generator"); - System.err.println(" -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)"); - System.err.println(" -all - consider pages from all crawl jobs"); - System.err.println(" -resume - resume a previous incomplete job"); - System.err.println(" -force - force re-parsing even if a page is already parsed"); + System.err + .println("Usage: ParserJob (<batchId> | -all) [-crawlId <id>] [-resume] [-force]"); + System.err + .println(" <batchId> - symbolic batch ID created by Generator"); + System.err + .println(" -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)"); + System.err + .println(" -all - consider pages from all crawl jobs"); + System.err + .println(" -resume - resume a previous incomplete job"); + System.err + .println(" -force - force re-parsing even if a page is already parsed"); return -1; } for (int i = 0; i < args.length; i++) { Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserNotFound.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserNotFound.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserNotFound.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserNotFound.java Fri Jan 9 06:34:33 2015 @@ -18,17 +18,17 @@ package org.apache.nutch.parse; public class ParserNotFound extends ParseException { - private static final long serialVersionUID=23993993939L; + private static final long serialVersionUID = 23993993939L; private String url; private String contentType; - public ParserNotFound(String message){ - super(message); + public ParserNotFound(String message) { + super(message); } - + public ParserNotFound(String url, 
String contentType) { - this(url, contentType, - "parser not found for contentType="+contentType+" url="+url); + this(url, contentType, "parser not found for contentType=" + contentType + + " url=" + url); } public ParserNotFound(String url, String contentType, String message) { @@ -37,6 +37,11 @@ public class ParserNotFound extends Pars this.contentType = contentType; } - public String getUrl() { return url; } - public String getContentType() { return contentType; } + public String getUrl() { + return url; + } + + public String getContentType() { + return contentType; + } } Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/package-info.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/package-info.java Fri Jan 9 06:34:33 2015 @@ -19,3 +19,4 @@ * The {@link org.apache.nutch.parse.Parse Parse} interface and related classes. */ package org.apache.nutch.parse; + Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/CircularDependencyException.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/CircularDependencyException.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/plugin/CircularDependencyException.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/CircularDependencyException.java Fri Jan 9 06:34:33 2015 @@ -16,7 +16,6 @@ */ package org.apache.nutch.plugin; - /** * <code>CircularDependencyException</code> will be thrown if a circular * dependency is detected. 
Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/Extension.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/Extension.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/plugin/Extension.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/Extension.java Fri Jan 9 06:34:33 2015 @@ -94,8 +94,10 @@ public class Extension { * Adds a attribute and is only used until model creation at plugin system * start up. * - * @param pKey a key - * @param pValue a value + * @param pKey + * a key + * @param pValue + * a value */ public void addAttribute(String pKey, String pValue) { fAttributes.put(pKey, pValue); @@ -105,7 +107,8 @@ public class Extension { * Sets the Class that implement the concret extension and is only used until * model creation at system start up. * - * @param extensionClazz The extensionClasname to set + * @param extensionClazz + * The extensionClasname to set */ public void setClazz(String extensionClazz) { fClazz = extensionClazz; @@ -115,7 +118,8 @@ public class Extension { * Sets the unique extension Id and is only used until model creation at * system start up. * - * @param extensionID The extensionID to set + * @param extensionID + * The extensionID to set */ public void setId(String extensionID) { fId = extensionID; @@ -147,10 +151,10 @@ public class Extension { // The same is in PluginRepository.getPluginInstance(). // Suggested by Stefan Groschupf <[email protected]> synchronized (getId()) { - try { + try { PluginRepository pluginRepository = PluginRepository.get(conf); - Class extensionClazz = - pluginRepository.getCachedClass(fDescriptor, getClazz()); + Class extensionClazz = pluginRepository.getCachedClass(fDescriptor, + getClazz()); // lazy loading of Plugin in case there is no instance of the plugin // already. 
pluginRepository.getPluginInstance(getDescriptor()); Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/ExtensionPoint.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/ExtensionPoint.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/plugin/ExtensionPoint.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/ExtensionPoint.java Fri Jan 9 06:34:33 2015 @@ -15,6 +15,7 @@ * limitations under the License. */ package org.apache.nutch.plugin; + import java.util.ArrayList; /** @@ -76,7 +77,8 @@ public class ExtensionPoint { /** * Sets the extensionPointId. * - * @param pId extension point id + * @param pId + * extension point id */ private void setId(String pId) { ftId = pId; Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/MissingDependencyException.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/MissingDependencyException.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/plugin/MissingDependencyException.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/MissingDependencyException.java Fri Jan 9 06:34:33 2015 @@ -17,8 +17,8 @@ package org.apache.nutch.plugin; /** - * <code>MissingDependencyException</code> will be thrown if a plugin - * dependency cannot be found. + * <code>MissingDependencyException</code> will be thrown if a plugin dependency + * cannot be found. 
* * @author Jérôme Charron */ Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/Pluggable.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/Pluggable.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/plugin/Pluggable.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/Pluggable.java Fri Jan 9 06:34:33 2015 @@ -17,15 +17,14 @@ package org.apache.nutch.plugin; /** - * Defines the capability of a class to be plugged into Nutch. - * This is a common interface that must be implemented by all - * Nutch Extension Points. - * + * Defines the capability of a class to be plugged into Nutch. This is a common + * interface that must be implemented by all Nutch Extension Points. + * * @author Jérôme Charron - * + * * @see <a href="http://wiki.apache.org/nutch/AboutPlugins">About Plugins</a> - * @see <a href="package-summary.html#package_description"> - * plugin package description</a> + * @see <a href="package-summary.html#package_description"> plugin package + * description</a> */ public interface Pluggable { } Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/Plugin.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/Plugin.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/plugin/Plugin.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/Plugin.java Fri Jan 9 06:34:33 2015 @@ -33,8 +33,8 @@ import org.apache.hadoop.conf.Configurat * The <code>Plugin</code> will be startuped and shutdown by the nutch plugin * management system. * - * A possible usecase of the <code>Plugin</code> implementation is to create - * or close a database connection. 
+ * A possible usecase of the <code>Plugin</code> implementation is to create or + * close a database connection. * * @author joa23 */ @@ -81,7 +81,8 @@ public class Plugin { } /** - * @param descriptor The descriptor to set + * @param descriptor + * The descriptor to set */ private void setDescriptor(PluginDescriptor descriptor) { fDescriptor = descriptor; Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginClassLoader.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginClassLoader.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginClassLoader.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginClassLoader.java Fri Jan 9 06:34:33 2015 @@ -45,11 +45,11 @@ public class PluginClassLoader extends U */ public PluginClassLoader(URL[] urls, ClassLoader parent) { super(urls, parent); - + this.urls = urls; this.parent = parent; } - + @Override public int hashCode() { final int PRIME = 31; Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java Fri Jan 9 06:34:33 2015 @@ -30,12 +30,11 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; /** - * The <code>PluginDescriptor</code> provide access to all meta information of - * a nutch-plugin, as well to the internationalizable resources and the plugin - * own classloader. 
There are meta information about <code>Plugin</code>, - * <code>ExtensionPoint</code> and <code>Extension</code>. To provide - * access to the meta data of a plugin via a descriptor allow a lazy loading - * mechanism. + * The <code>PluginDescriptor</code> provide access to all meta information of a + * nutch-plugin, as well to the internationalizable resources and the plugin own + * classloader. There are meta information about <code>Plugin</code>, + * <code>ExtensionPoint</code> and <code>Extension</code>. To provide access to + * the meta data of a plugin via a descriptor allow a lazy loading mechanism. */ public class PluginDescriptor { private String fPluginPath; @@ -51,7 +50,8 @@ public class PluginDescriptor { private ArrayList<URL> fNotExportedLibs = new ArrayList<URL>(); private ArrayList<Extension> fExtensions = new ArrayList<Extension>(); private PluginClassLoader fClassLoader; - public static final Logger LOG = LoggerFactory.getLogger(PluginDescriptor.class); + public static final Logger LOG = LoggerFactory + .getLogger(PluginDescriptor.class); private Configuration fConf; /** @@ -204,7 +204,8 @@ public class PluginDescriptor { /** * Adds a dependency * - * @param pId id of the dependent plugin + * @param pId + * id of the dependent plugin */ public void addDependency(String pId) { fDependencies.add(pId); @@ -217,7 +218,8 @@ public class PluginDescriptor { */ public void addExportedLibRelative(String pLibPath) throws MalformedURLException { - URL url = new File(getPluginPath() + File.separator + pLibPath).toURI().toURL(); + URL url = new File(getPluginPath() + File.separator + pLibPath).toURI() + .toURL(); fExportedLibs.add(url); } @@ -246,7 +248,8 @@ public class PluginDescriptor { */ public void addNotExportedLibRelative(String pLibPath) throws MalformedURLException { - URL url = new File(getPluginPath() + File.separator + pLibPath).toURI().toURL(); + URL url = new File(getPluginPath() + File.separator + pLibPath).toURI() + .toURL(); 
fNotExportedLibs.add(url); } @@ -283,8 +286,8 @@ public class PluginDescriptor { LOG.debug(getPluginId() + " " + e.toString()); } URL[] urls = arrayList.toArray(new URL[arrayList.size()]); - fClassLoader = new PluginClassLoader(urls, PluginDescriptor.class - .getClassLoader()); + fClassLoader = new PluginClassLoader(urls, + PluginDescriptor.class.getClassLoader()); return fClassLoader; } @@ -306,7 +309,7 @@ public class PluginDescriptor { for (String id : pDescriptor.getDependencies()) { PluginDescriptor descriptor = PluginRepository.get(fConf) .getPluginDescriptor(id); - for (URL url: descriptor.getExportedLibUrls()) { + for (URL url : descriptor.getExportedLibUrls()) { pLibs.add(url); } collectLibs(pLibs, descriptor); Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java Fri Jan 9 06:34:33 2015 @@ -39,8 +39,8 @@ import org.w3c.dom.NodeList; import org.xml.sax.SAXException; /** - * The <code>PluginManifestParser</code> parser just parse the manifest file - * in all plugin directories. + * The <code>PluginManifestParser</code> parser just parse the manifest file in + * all plugin directories. 
* * @author joa23 */ @@ -93,7 +93,8 @@ public class PluginManifestParser { PluginDescriptor p = parseManifestFile(manifestPath); map.put(p.getPluginId(), p); } catch (Exception e) { - LOG.warn("Error while loading plugin `" + manifestPath + "` " + e.toString()); + LOG.warn("Error while loading plugin `" + manifestPath + "` " + + e.toString()); } } } @@ -182,7 +183,7 @@ public class PluginManifestParser { PluginDescriptor pluginDescriptor = new PluginDescriptor(id, version, name, providerName, pluginClazz, pPath, this.conf); LOG.debug("plugin: id=" + id + " name=" + name + " version=" + version - + " provider=" + providerName + "class=" + pluginClazz); + + " provider=" + providerName + "class=" + pluginClazz); parseExtension(rootElement, pluginDescriptor); parseExtensionPoints(rootElement, pluginDescriptor); parseLibraries(rootElement, pluginDescriptor); @@ -289,8 +290,8 @@ public class PluginManifestParser { if (parameters != null) { for (int k = 0; k < parameters.getLength(); k++) { Element param = (Element) parameters.item(k); - extension.addAttribute(param.getAttribute(ATTR_NAME), param - .getAttribute("value")); + extension.addAttribute(param.getAttribute(ATTR_NAME), + param.getAttribute("value")); } } pPluginDescriptor.addExtension(extension); Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java Fri Jan 9 06:34:33 2015 @@ -50,13 +50,13 @@ public class PluginRepository { private HashMap<String, ExtensionPoint> fExtensionPoints; private HashMap<String, Plugin> fActivatedPlugins; - - private static final Map<String, Map<PluginClassLoader, 
Class>> CLASS_CACHE = - new HashMap<String, Map<PluginClassLoader,Class>>(); + + private static final Map<String, Map<PluginClassLoader, Class>> CLASS_CACHE = new HashMap<String, Map<PluginClassLoader, Class>>(); private Configuration conf; - public static final Logger LOG = LoggerFactory.getLogger(PluginRepository.class); + public static final Logger LOG = LoggerFactory + .getLogger(PluginRepository.class); /** * @throws PluginRuntimeException @@ -68,7 +68,8 @@ public class PluginRepository { this.conf = new Configuration(conf); this.auto = conf.getBoolean("plugin.auto-activation", true); String[] pluginFolders = conf.getStrings("plugin.folders"); - PluginManifestParser manifestParser = new PluginManifestParser(this.conf, this); + PluginManifestParser manifestParser = new PluginManifestParser(this.conf, + this); Map<String, PluginDescriptor> allPlugins = manifestParser .parsePluginFolder(pluginFolders); if (allPlugins.isEmpty()) { @@ -85,7 +86,7 @@ public class PluginRepository { try { installExtensions(fRegisteredPlugins); } catch (PluginRuntimeException e) { - LOG.error(e.toString()); + LOG.error(e.toString()); throw new RuntimeException(e.getMessage()); } displayStatus(); @@ -112,8 +113,8 @@ public class PluginRepository { return; } - for (PluginDescriptor plugin: plugins) { - for(ExtensionPoint point:plugin.getExtenstionPoints()) { + for (PluginDescriptor plugin : plugins) { + for (ExtensionPoint point : plugin.getExtenstionPoints()) { String xpId = point.getId(); LOG.debug("Adding extension point " + xpId); fExtensionPoints.put(xpId, point); @@ -128,7 +129,7 @@ public class PluginRepository { throws PluginRuntimeException { for (PluginDescriptor descriptor : pRegisteredPlugins) { - for(Extension extension:descriptor.getExtensions()) { + for (Extension extension : descriptor.getExtensions()) { String xpId = extension.getTargetPoint(); ExtensionPoint point = getExtensionPoint(xpId); if (point == null) { @@ -156,7 +157,7 @@ public class PluginRepository { 
branch.put(plugin.getPluginId(), plugin); // Otherwise, checks each dependency - for(String id:plugin.getDependencies()) { + for (String id : plugin.getDependencies()) { PluginDescriptor dependency = plugins.get(id); if (dependency == null) { throw new MissingDependencyException("Missing dependency " + id @@ -271,7 +272,8 @@ public class PluginRepository { // The same is in Extension.getExtensionInstance(). // Suggested by Stefan Groschupf <[email protected]> synchronized (pDescriptor) { - Class<?> pluginClass = getCachedClass(pDescriptor, pDescriptor.getPluginClass()); + Class<?> pluginClass = getCachedClass(pDescriptor, + pDescriptor.getPluginClass()); Constructor<?> constructor = pluginClass.getConstructor(new Class<?>[] { PluginDescriptor.class, Configuration.class }); Plugin plugin = (Plugin) constructor.newInstance(new Object[] { @@ -312,9 +314,9 @@ public class PluginRepository { plugin.shutDown(); } } - + public Class getCachedClass(PluginDescriptor pDescriptor, String className) - throws ClassNotFoundException { + throws ClassNotFoundException { Map<PluginClassLoader, Class> descMap = CLASS_CACHE.get(className); if (descMap == null) { descMap = new HashMap<PluginClassLoader, Class>(); Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRuntimeException.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRuntimeException.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRuntimeException.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRuntimeException.java Fri Jan 9 06:34:33 2015 @@ -16,6 +16,7 @@ * limitations under the License. */ package org.apache.nutch.plugin; + /** * <code>PluginRuntimeException</code> will be thrown until a exception in the * plugin managemnt occurs. 
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java Fri Jan 9 06:34:33 2015 @@ -41,7 +41,7 @@ import org.apache.nutch.metadata.Metadat import org.apache.nutch.util.MimeUtil; import org.apache.nutch.util.NutchConfiguration; -public final class Content implements Writable{ +public final class Content implements Writable { public static final String DIR_NAME = "content"; @@ -85,7 +85,7 @@ public final class Content implements Wr this.mimeTypes = new MimeUtil(conf); this.contentType = getContentType(contentType, url, content); } - + public Content(String url, String base, byte[] content, String contentType, Metadata metadata, MimeUtil mimeTypes) { @@ -141,11 +141,11 @@ public final class Content implements Wr metadata.readFields(in); // read meta data break; default: - throw new VersionMismatchException((byte)2, oldVersion); + throw new VersionMismatchException((byte) 2, oldVersion); } } - + public final void readFields(DataInput in) throws IOException { metadata.clear(); int sizeOrVersion = in.readInt(); @@ -163,14 +163,14 @@ public final class Content implements Wr metadata.readFields(in); break; default: - throw new VersionMismatchException((byte)VERSION, (byte)version); + throw new VersionMismatchException((byte) VERSION, (byte) version); } } else { // size byte[] compressed = new byte[sizeOrVersion]; in.readFully(compressed, 0, compressed.length); ByteArrayInputStream deflated = new ByteArrayInputStream(compressed); - DataInput inflater = - new DataInputStream(new InflaterInputStream(deflated)); + DataInput inflater = new DataInputStream( + new 
InflaterInputStream(deflated)); readFieldsCompressed(inflater); } } @@ -204,8 +204,9 @@ public final class Content implements Wr return url; } - /** The base url for relative links contained in the content. - * Maybe be different from url if the request redirected. + /** + * The base url for relative links contained in the content. Maybe be + * different from url if the request redirected. */ public String getBaseUrl() { return base; @@ -220,7 +221,9 @@ public final class Content implements Wr this.content = content; } - /** The media type of the retrieved content. + /** + * The media type of the retrieved content. + * * @see <a href="http://www.iana.org/assignments/media-types/"> * http://www.iana.org/assignments/media-types/</a> */ @@ -276,9 +279,9 @@ public final class Content implements Wr System.out.println("usage:" + usage); return; } - - GenericOptionsParser optParser = - new GenericOptionsParser(NutchConfiguration.create(), args); + + GenericOptionsParser optParser = new GenericOptionsParser( + NutchConfiguration.create(), args); String[] argv = optParser.getRemainingArgs(); Configuration conf = optParser.getConfiguration(); Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java Fri Jan 9 06:34:33 2015 @@ -25,7 +25,7 @@ import org.apache.nutch.storage.WebPage; import crawlercommons.robots.BaseRobotRules; -/** A retriever of url content. Implemented by protocol extensions. */ +/** A retriever of url content. Implemented by protocol extensions. */ public interface Protocol extends FieldPluggable, Configurable { /** The name of the extension point. 
*/ public final static String X_POINT_ID = Protocol.class.getName(); @@ -55,7 +55,9 @@ public interface Protocol extends FieldP /** * Retrieve robot rules applicable for this url. - * @param url url to check + * + * @param url + * url to check * @param page * @return robot rules (specific for this url or default), never null */ Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolNotFound.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolNotFound.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolNotFound.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolNotFound.java Fri Jan 9 06:34:33 2015 @@ -22,7 +22,7 @@ public class ProtocolNotFound extends Pr private String url; public ProtocolNotFound(String url) { - this(url, "protocol not found for url="+url); + this(url, "protocol not found for url=" + url); } public ProtocolNotFound(String url, String message) { @@ -30,5 +30,7 @@ public class ProtocolNotFound extends Pr this.url = url; } - public String getUrl() { return url; } + public String getUrl() { + return url; + } } Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolOutput.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolOutput.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolOutput.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolOutput.java Fri Jan 9 06:34:33 2015 @@ -17,10 +17,10 @@ package org.apache.nutch.protocol; - /** - * Simple aggregate to pass from protocol plugins both content and - * protocol status. 
+ * Simple aggregate to pass from protocol plugins both content and protocol + * status. + * * @author Andrzej Bialecki <[email protected]> */ public class ProtocolOutput { Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java Fri Jan 9 06:34:33 2015 @@ -19,38 +19,42 @@ package org.apache.nutch.protocol; public interface ProtocolStatusCodes { /** Content was retrieved without errors. */ - public static final int SUCCESS = 1; + public static final int SUCCESS = 1; /** Content was not retrieved. Any further errors may be indicated in args. */ - public static final int FAILED = 2; + public static final int FAILED = 2; - /** This protocol was not found. Application may attempt to retry later. */ - public static final int PROTO_NOT_FOUND = 10; + /** This protocol was not found. Application may attempt to retry later. */ + public static final int PROTO_NOT_FOUND = 10; /** Resource is gone. */ - public static final int GONE = 11; + public static final int GONE = 11; /** Resource has moved permanently. New url should be found in args. */ - public static final int MOVED = 12; + public static final int MOVED = 12; /** Resource has moved temporarily. New url should be found in args. */ - public static final int TEMP_MOVED = 13; + public static final int TEMP_MOVED = 13; /** Resource was not found. */ - public static final int NOTFOUND = 14; + public static final int NOTFOUND = 14; /** Temporary failure. Application may retry immediately. */ - public static final int RETRY = 15; - /** Unspecified exception occured. 
Further information may be provided in args. */ - public static final int EXCEPTION = 16; + public static final int RETRY = 15; + /** + * Unspecified exception occured. Further information may be provided in args. + */ + public static final int EXCEPTION = 16; /** Access denied - authorization required, but missing/incorrect. */ - public static final int ACCESS_DENIED = 17; + public static final int ACCESS_DENIED = 17; /** Access denied by robots.txt rules. */ - public static final int ROBOTS_DENIED = 18; + public static final int ROBOTS_DENIED = 18; /** Too many redirects. */ - public static final int REDIR_EXCEEDED = 19; + public static final int REDIR_EXCEEDED = 19; /** Not fetching. */ - public static final int NOTFETCHING = 20; + public static final int NOTFETCHING = 20; /** Unchanged since the last fetch. */ - public static final int NOTMODIFIED = 21; - /** Request was refused by protocol plugins, because it would block. - * The expected number of milliseconds to wait before retry may be provided - * in args. */ - public static final int WOULDBLOCK = 22; + public static final int NOTMODIFIED = 21; + /** + * Request was refused by protocol plugins, because it would block. The + * expected number of milliseconds to wait before retry may be provided in + * args. + */ + public static final int WOULDBLOCK = 22; /** Thread was blocked http.max.delays times during fetching. 
*/ - public static final int BLOCKED = 23; + public static final int BLOCKED = 23; } Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java Fri Jan 9 06:34:33 2015 @@ -100,7 +100,7 @@ public class ProtocolStatusUtils impleme } return TableUtil.toString(args.iterator().next()); } - + public static String toString(ProtocolStatus status) { if (status == null) { return "(null)"; @@ -113,7 +113,8 @@ public class ProtocolStatusUtils impleme int i = 0; Iterator<CharSequence> it = args.iterator(); while (it.hasNext()) { - if (i > 0) sb.append(','); + if (i > 0) + sb.append(','); sb.append(it.next()); i++; } Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRules.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRules.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRules.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRules.java Fri Jan 9 06:34:33 2015 @@ -35,9 +35,8 @@ public interface RobotRules { public long getCrawlDelay(); /** - * Returns <code>false</code> if the <code>robots.txt</code> file - * prohibits us from accessing the given <code>url</code>, or - * <code>true</code> otherwise. + * Returns <code>false</code> if the <code>robots.txt</code> file prohibits us + * from accessing the given <code>url</code>, or <code>true</code> otherwise. 
*/ public boolean isAllowed(URL url); Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java Fri Jan 9 06:34:33 2015 @@ -43,35 +43,38 @@ import crawlercommons.robots.SimpleRobot import crawlercommons.robots.SimpleRobotRulesParser; /** - * This class uses crawler-commons for handling the parsing of {@code robots.txt} files. - * It emits SimpleRobotRules objects, which describe the download permissions - * as described in SimpleRobotRulesParser. + * This class uses crawler-commons for handling the parsing of + * {@code robots.txt} files. It emits SimpleRobotRules objects, which describe + * the download permissions as described in SimpleRobotRulesParser. */ public abstract class RobotRulesParser implements Configurable { - public static final Logger LOG = LoggerFactory.getLogger(RobotRulesParser.class); + public static final Logger LOG = LoggerFactory + .getLogger(RobotRulesParser.class); - protected static final Hashtable<String, BaseRobotRules> CACHE = new Hashtable<String, BaseRobotRules> (); + protected static final Hashtable<String, BaseRobotRules> CACHE = new Hashtable<String, BaseRobotRules>(); /** - * A {@link BaseRobotRules} object appropriate for use - * when the {@code robots.txt} file is empty or missing; - * all requests are allowed. + * A {@link BaseRobotRules} object appropriate for use when the + * {@code robots.txt} file is empty or missing; all requests are allowed. 
*/ - public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL); + public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules( + RobotRulesMode.ALLOW_ALL); /** - * A {@link BaseRobotRules} object appropriate for use when the - * {@code robots.txt} file is not fetched due to a {@code 403/Forbidden} - * response; all requests are disallowed. + * A {@link BaseRobotRules} object appropriate for use when the + * {@code robots.txt} file is not fetched due to a {@code 403/Forbidden} + * response; all requests are disallowed. */ - public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE); + public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules( + RobotRulesMode.ALLOW_NONE); private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser(); private Configuration conf; protected String agentNames; - public RobotRulesParser() { } + public RobotRulesParser() { + } public RobotRulesParser(Configuration conf) { setConf(conf); @@ -90,16 +93,18 @@ public abstract class RobotRulesParser i } agentNames = agentName; - // If there are any other agents specified, append those to the list of agents + // If there are any other agents specified, append those to the list of + // agents String otherAgents = conf.get("http.robots.agents"); - if(otherAgents != null && !otherAgents.trim().isEmpty()) { + if (otherAgents != null && !otherAgents.trim().isEmpty()) { StringTokenizer tok = new StringTokenizer(otherAgents, ","); StringBuilder sb = new StringBuilder(agentNames); while (tok.hasMoreTokens()) { String str = tok.nextToken().trim(); if (str.equals("*") || str.equals(agentName)) { // skip wildcard "*" or agent name itself - // (required for backward compatibility, cf. NUTCH-1715 and NUTCH-1718) + // (required for backward compatibility, cf. 
NUTCH-1715 and + // NUTCH-1718) } else { sb.append(",").append(str); } @@ -117,16 +122,23 @@ public abstract class RobotRulesParser i } /** - * Parses the robots content using the {@link SimpleRobotRulesParser} from crawler commons - * - * @param url A string containing url - * @param content Contents of the robots file in a byte array - * @param contentType The content type of the robots file - * @param robotName A string containing all the robots agent names used by parser for matching - * @return BaseRobotRules object + * Parses the robots content using the {@link SimpleRobotRulesParser} from + * crawler commons + * + * @param url + * A string containing url + * @param content + * Contents of the robots file in a byte array + * @param contentType + * The content type of the robots file + * @param robotName + * A string containing all the robots agent names used by parser for + * matching + * @return BaseRobotRules object */ - public BaseRobotRules parseRules (String url, byte[] content, String contentType, String robotName) { - return robotParser.parseContent(url, content, contentType, robotName); + public BaseRobotRules parseRules(String url, byte[] content, + String contentType, String robotName) { + return robotParser.parseContent(url, content, contentType, robotName); } public BaseRobotRules getRobotRulesSet(Protocol protocol, String url) { @@ -145,23 +157,29 @@ public abstract class RobotRulesParser i public static void main(String[] argv) { if (argv.length != 3) { - System.err.println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n"); - System.err.println(" <robots-file> - Input robots.txt file which will be parsed."); - System.err.println(" <url-file> - Contains input URLs (1 per line) which are tested against the rules."); - System.err.println(" <agent-names> - Input agent names. 
Multiple agent names can be provided using"); - System.err.println(" comma as a delimiter without any spaces."); + System.err + .println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n"); + System.err + .println(" <robots-file> - Input robots.txt file which will be parsed."); + System.err + .println(" <url-file> - Contains input URLs (1 per line) which are tested against the rules."); + System.err + .println(" <agent-names> - Input agent names. Multiple agent names can be provided using"); + System.err + .println(" comma as a delimiter without any spaces."); System.exit(-1); } try { byte[] robotsBytes = Files.toByteArray(new File(argv[0])); - BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", argv[2]); + BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, + "text/plain", argv[2]); LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1])); String testPath = testsIn.readLine().trim(); while (testPath != null) { - System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not allowed") + - ":\t" + testPath); + System.out.println((rules.isAllowed(testPath) ? "allowed" + : "not allowed") + ":\t" + testPath); testPath = testsIn.readLine(); } testsIn.close(); Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/protocol/package-info.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/package-info.java Fri Jan 9 06:34:33 2015 @@ -20,3 +20,4 @@ * see also {@link org.apache.nutch.net.protocols}. */ package org.apache.nutch.protocol; +
