sr...

lewismc Thu, 08 Jan 2015 22:35:25 -0800

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java 
Fri Jan  9 06:34:33 2015
@@ -22,44 +22,52 @@ public interface ParseStatusCodes {
   // Primary status codes:
 
   /** Parsing was not performed. */
-  public static final byte NOTPARSED       = 0;
+  public static final byte NOTPARSED = 0;
   /** Parsing succeeded. */
-  public static final byte SUCCESS         = 1;
+  public static final byte SUCCESS = 1;
   /** General failure. There may be a more specific error message in 
arguments. */
-  public static final byte FAILED          = 2;
+  public static final byte FAILED = 2;
 
-  public static final String[] majorCodes = {
-    "notparsed",
-    "success",
-    "failed"
-  };
+  public static final String[] majorCodes = { "notparsed", "success", "failed" 
};
 
   // Secondary success codes go here:
 
-  public static final short SUCCESS_OK                = 0;
+  public static final short SUCCESS_OK = 0;
 
-  /** Parsed content contains a directive to redirect to another URL.
-   * The target URL can be retrieved from the arguments.
+  /**
+   * Parsed content contains a directive to redirect to another URL. The target
+   * URL can be retrieved from the arguments.
    */
-  public static final short SUCCESS_REDIRECT          = 100;
+  public static final short SUCCESS_REDIRECT = 100;
 
   // Secondary failure codes go here:
 
-  /** Parsing failed. An Exception occured (which may be retrieved from the 
arguments). */
-  public static final short FAILED_EXCEPTION          = 200;
-  /** Parsing failed. Content was truncated, but the parser cannot handle 
incomplete content. */
-  public static final short FAILED_TRUNCATED          = 202;
-  /** Parsing failed. Invalid format - the content may be corrupted or of 
wrong type. */
-  public static final short FAILED_INVALID_FORMAT     = 203;
-  /** Parsing failed. Other related parts of the content are needed to complete
+  /**
+   * Parsing failed. An Exception occured (which may be retrieved from the
+   * arguments).
+   */
+  public static final short FAILED_EXCEPTION = 200;
+  /**
+   * Parsing failed. Content was truncated, but the parser cannot handle
+   * incomplete content.
+   */
+  public static final short FAILED_TRUNCATED = 202;
+  /**
+   * Parsing failed. Invalid format - the content may be corrupted or of wrong
+   * type.
+   */
+  public static final short FAILED_INVALID_FORMAT = 203;
+  /**
+   * Parsing failed. Other related parts of the content are needed to complete
    * parsing. The list of URLs to missing parts may be provided in arguments.
    * The Fetcher may decide to fetch these parts at once, then put them into
    * Content.metadata, and supply them for re-parsing.
    */
-  public static final short FAILED_MISSING_PARTS      = 204;
-  /** Parsing failed. There was no content to be parsed - probably caused
-   * by errors at protocol stage.
+  public static final short FAILED_MISSING_PARTS = 204;
+  /**
+   * Parsing failed. There was no content to be parsed - probably caused by
+   * errors at protocol stage.
    */
-  public static final short FAILED_MISSING_CONTENT    = 205;
-  
+  public static final short FAILED_MISSING_CONTENT = 205;
+
 }


Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java 
Fri Jan  9 06:34:33 2015
@@ -29,10 +29,10 @@ import java.util.List;
 public class ParseStatusUtils {
 
   public static ParseStatus STATUS_SUCCESS = ParseStatus.newBuilder().build();
-  public static final HashMap<Short,String> minorCodes = new 
HashMap<Short,String>();
+  public static final HashMap<Short, String> minorCodes = new HashMap<Short, 
String>();
 
   static {
-    STATUS_SUCCESS.setMajorCode((int)ParseStatusCodes.SUCCESS);
+    STATUS_SUCCESS.setMajorCode((int) ParseStatusCodes.SUCCESS);
     minorCodes.put(ParseStatusCodes.SUCCESS_OK, "ok");
     minorCodes.put(ParseStatusCodes.SUCCESS_REDIRECT, "redirect");
     minorCodes.put(ParseStatusCodes.FAILED_EXCEPTION, "exception");
@@ -49,8 +49,9 @@ public class ParseStatusUtils {
     return status.getMajorCode() == ParseStatusCodes.SUCCESS;
   }
 
-  /** A convenience method. Return a String representation of the first
-   * argument, or null.
+  /**
+   * A convenience method. Return a String representation of the first 
argument,
+   * or null.
    */
   public static String getMessage(ParseStatus status) {
     List<CharSequence> args = status.getArgs();
@@ -77,29 +78,30 @@ public class ParseStatusUtils {
 
   public static Parse getEmptyParse(Exception e, Configuration conf) {
     ParseStatus status = ParseStatus.newBuilder().build();
-    status.setMajorCode((int)ParseStatusCodes.FAILED);
-    status.setMinorCode((int)ParseStatusCodes.FAILED_EXCEPTION);
+    status.setMajorCode((int) ParseStatusCodes.FAILED);
+    status.setMinorCode((int) ParseStatusCodes.FAILED_EXCEPTION);
     status.getArgs().add(new Utf8(e.toString()));
 
     return new Parse("", "", new Outlink[0], status);
   }
 
-  public static Parse getEmptyParse(int minorCode, String message, 
Configuration conf) {
+  public static Parse getEmptyParse(int minorCode, String message,
+      Configuration conf) {
     ParseStatus status = ParseStatus.newBuilder().build();
-    status.setMajorCode((int)ParseStatusCodes.FAILED);
+    status.setMajorCode((int) ParseStatusCodes.FAILED);
     status.setMinorCode(minorCode);
     status.getArgs().add(new Utf8(message));
 
     return new Parse("", "", new Outlink[0], status);
   }
-  
+
   public static String toString(ParseStatus status) {
     if (status == null) {
       return "(null)";
     }
     StringBuilder sb = new StringBuilder();
-    sb.append(ParseStatusCodes.majorCodes[status.getMajorCode()] +
-        "/" + minorCodes.get(status.getMinorCode().shortValue()));
+    sb.append(ParseStatusCodes.majorCodes[status.getMajorCode()] + "/"
+        + minorCodes.get(status.getMinorCode().shortValue()));
     sb.append(" (" + status.getMajorCode() + "/" + status.getMinorCode() + 
")");
     sb.append(", args=[");
     List<CharSequence> args = status.getArgs();
@@ -107,7 +109,8 @@ public class ParseStatusUtils {
       int i = 0;
       Iterator<CharSequence> it = args.iterator();
       while (it.hasNext()) {
-        if (i > 0) sb.append(',');
+        if (i > 0)
+          sb.append(',');
         sb.append(it.next());
         i++;
       }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Fri Jan  
9 06:34:33 2015
@@ -49,7 +49,7 @@ import java.util.concurrent.TimeUnit;
  * A Utility class containing methods to simply perform parsing utilities such
  * as iterating through a preferred list of {@link Parser}s to obtain
  * {@link Parse} objects.
- *
+ * 
  * @author mattmann
  * @author J&eacute;r&ocirc;me Charron
  * @author S&eacute;bastien Le Callonnec
@@ -60,7 +60,7 @@ public class ParseUtil extends Configure
   public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class);
 
   private static final int DEFAULT_MAX_PARSE_TIME = 30;
-  
+
   private Configuration conf;
   private Signature sig;
   private URLFilters filters;
@@ -71,9 +71,9 @@ public class ParseUtil extends Configure
   /** Parser timeout set to 30 sec by default. Set -1 to deactivate **/
   private int maxParseTime;
   private ExecutorService executorService;
-  
+
   /**
-   *
+   * 
    * @param conf
    */
   public ParseUtil(Configuration conf) {
@@ -90,15 +90,16 @@ public class ParseUtil extends Configure
   public void setConf(Configuration conf) {
     this.conf = conf;
     parserFactory = new ParserFactory(conf);
-    maxParseTime=conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME);
+    maxParseTime = conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME);
     sig = SignatureFactory.getSignature(conf);
     filters = new URLFilters(conf);
     normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
     int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
-    maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : 
maxOutlinksPerPage;
+    maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
+        : maxOutlinksPerPage;
     ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
     executorService = Executors.newCachedThreadPool(new ThreadFactoryBuilder()
-      .setNameFormat("parse-%d").setDaemon(true).build());
+        .setNameFormat("parse-%d").setDaemon(true).build());
   }
 
   /**
@@ -106,11 +107,13 @@ public class ParseUtil extends Configure
    * until a successful parse is performed and a {@link Parse} object is
    * returned. If the parse is unsuccessful, a message is logged to the
    * <code>WARNING</code> level, and an empty parse is returned.
-   *
-   * @throws ParserNotFound If there is no suitable parser found. 
-   * @throws ParseException If there is an error parsing.
+   * 
+   * @throws ParserNotFound
+   *           If there is no suitable parser found.
+   * @throws ParseException
+   *           If there is an error parsing.
    */
-  public Parse parse(String url, WebPage page) throws ParserNotFound, 
+  public Parse parse(String url, WebPage page) throws ParserNotFound,
       ParseException {
     Parser[] parsers = null;
 
@@ -118,28 +121,29 @@ public class ParseUtil extends Configure
 
     parsers = this.parserFactory.getParsers(contentType, url);
 
-    for (int i=0; i<parsers.length; i++) {
+    for (int i = 0; i < parsers.length; i++) {
       if (LOG.isDebugEnabled()) {
         LOG.debug("Parsing [" + url + "] with [" + parsers[i] + "]");
       }
       Parse parse = null;
-      
-      if (maxParseTime!=-1)
-         parse = runParser(parsers[i], url, page);
-      else 
-         parse = parsers[i].getParse(url, page);
-      
-      if (parse!=null && ParseStatusUtils.isSuccess(parse.getParseStatus())) {
+
+      if (maxParseTime != -1)
+        parse = runParser(parsers[i], url, page);
+      else
+        parse = parsers[i].getParse(url, page);
+
+      if (parse != null && ParseStatusUtils.isSuccess(parse.getParseStatus())) 
{
         return parse;
       }
     }
 
-    LOG.warn("Unable to successfully parse content " + url +
-        " of type " + contentType);
-    return ParseStatusUtils.getEmptyParse(new ParseException("Unable to 
successfully parse content"), null);
+    LOG.warn("Unable to successfully parse content " + url + " of type "
+        + contentType);
+    return ParseStatusUtils.getEmptyParse(new ParseException(
+        "Unable to successfully parse content"), null);
   }
-  
-  private Parse runParser(Parser p, String url, WebPage page) {    
+
+  private Parse runParser(Parser p, String url, WebPage page) {
     ParseCallable pc = new ParseCallable(p, page, url);
     Future<Parse> task = executorService.submit(pc);
     Parse res = null;
@@ -155,8 +159,9 @@ public class ParseUtil extends Configure
   }
 
   /**
-   * Parses given web page and stores parsed content within page. Puts
-   * a meta-redirect to outlinks.
+   * Parses given web page and stores parsed content within page. Puts a
+   * meta-redirect to outlinks.
+   * 
    * @param key
    * @param page
    */
@@ -165,7 +170,8 @@ public class ParseUtil extends Configure
     byte status = page.getStatus().byteValue();
     if (status != CrawlStatus.STATUS_FETCHED) {
       if (LOG.isDebugEnabled()) {
-        LOG.debug("Skipping " + url + " as status is: " + 
CrawlStatus.getName(status));
+        LOG.debug("Skipping " + url + " as status is: "
+            + CrawlStatus.getName(status));
       }
       return;
     }
@@ -213,7 +219,8 @@ public class ParseUtil extends Configure
           return;
         }
         page.getOutlinks().put(new Utf8(newUrl), new Utf8());
-        page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED, 
TableUtil.YES_VAL);
+        page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED,
+            TableUtil.YES_VAL);
         if (newUrl == null || newUrl.equals(url)) {
           String reprUrl = URLUtil.chooseRepr(url, newUrl,
               refreshTime < FetcherJob.PERM_REFRESH_TIME);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/Parser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/Parser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/Parser.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/Parser.java Fri Jan  9 
06:34:33 2015
@@ -22,9 +22,10 @@ import org.apache.hadoop.conf.Configurab
 import org.apache.nutch.plugin.FieldPluggable;
 import org.apache.nutch.storage.WebPage;
 
-/** A parser for content generated by a {@link 
org.apache.nutch.protocol.Protocol}
- * implementation.  This interface is implemented by extensions.  Nutch's core
- * contains no page parsing code.
+/**
+ * A parser for content generated by a
+ * {@link org.apache.nutch.protocol.Protocol} implementation. This interface is
+ * implemented by extensions. Nutch's core contains no page parsing code.
  */
 public interface Parser extends FieldPluggable, Configurable {
   /** The name of the extension point. */
@@ -34,8 +35,9 @@ public interface Parser extends FieldPlu
    * <p>
    * This method parses content in WebPage instance
    * </p>
-   *
-   * @param url Page's URL
+   * 
+   * @param url
+   *          Page's URL
    * @param page
    */
   Parse getParse(String url, WebPage page);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Fri 
Jan  9 06:34:33 2015
@@ -37,28 +37,30 @@ import java.util.Map;
 import java.util.Map.Entry;
 
 /**
- * Parser checker, useful for testing parser.
- * It also accurately reports possible fetching and 
- * parsing failures and presents protocol status signals to aid 
- * debugging. The tool enables us to retrieve the following data from 
- * any url:
+ * Parser checker, useful for testing parser. It also accurately reports
+ * possible fetching and parsing failures and presents protocol status signals
+ * to aid debugging. The tool enables us to retrieve the following data from 
any
+ * url:
  * <ol>
- * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content} 
type.</li>
- * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) 
and is used to remove
- * duplicates during the dedup procedure. 
- * It is calculated using {@link org.apache.nutch.crawl.MD5Signature} or
+ * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content}
+ * type.</li>
+ * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) 
and
+ * is used to remove duplicates during the dedup procedure. It is calculated
+ * using {@link org.apache.nutch.crawl.MD5Signature} or
  * {@link org.apache.nutch.crawl.TextProfileSignature}.</li>
  * <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
  * <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
  * <li><tt>Title</tt>: of the URL</li>
  * <li><tt>Outlinks</tt>: associated with the URL</li>
  * <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>,
- * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>, 
<i>Cache-Control</>, etc.</li>
+ * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>,
+ * <i>Cache-Control</>, etc.</li>
  * <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>,
  * <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li>
- * <li><tt>ParseText</tt>: The page parse text which varies in length 
depdnecing on 
- * <code>content.length</code> configuration.</li>
+ * <li><tt>ParseText</tt>: The page parse text which varies in length 
depdnecing
+ * on <code>content.length</code> configuration.</li>
  * </ol>
+ * 
  * @author John Xing
  */
 
@@ -107,7 +109,7 @@ public class ParserChecker implements To
 
     ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
 
-    if(!protocolOutput.getStatus().isSuccess()) {
+    if (!protocolOutput.getStatus().isSuccess()) {
       LOG.error("Fetch failed with protocol status: "
           + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
           + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
@@ -155,7 +157,6 @@ public class ParserChecker implements To
       LOG.info("signature: " + StringUtil.toHexString(signature));
     }
 
-
     LOG.info("---------\nUrl\n---------------\n");
     System.out.print(url + "\n");
     LOG.info("---------\nMetadata\n---------\n");
@@ -167,7 +168,7 @@ public class ParserChecker implements To
       while (iterator.hasNext()) {
         Entry<CharSequence, ByteBuffer> entry = iterator.next();
         sb.append(entry.getKey().toString()).append(" : \t")
-        .append(Bytes.toString(entry.getValue())).append("\n");
+            .append(Bytes.toString(entry.getValue())).append("\n");
       }
       System.out.print(sb.toString());
     }
@@ -182,12 +183,12 @@ public class ParserChecker implements To
       Map<CharSequence, CharSequence> headers = page.getHeaders();
       StringBuffer headersb = new StringBuffer();
       if (metadata != null) {
-        Iterator<Entry<CharSequence, CharSequence>> iterator = 
headers.entrySet()
-            .iterator();
+        Iterator<Entry<CharSequence, CharSequence>> iterator = headers
+            .entrySet().iterator();
         while (iterator.hasNext()) {
           Entry<CharSequence, CharSequence> entry = iterator.next();
           headersb.append(entry.getKey().toString()).append(" : \t")
-          .append(entry.getValue()).append("\n");
+              .append(entry.getValue()).append("\n");
         }
         System.out.print(headersb.toString());
       }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserFactory.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserFactory.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserFactory.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserFactory.java Fri 
Jan  9 06:34:33 2015
@@ -34,8 +34,7 @@ import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.ObjectCache;
 
-
-/** Creates and caches {@link Parser} plugins.*/
+/** Creates and caches {@link Parser} plugins. */
 public final class ParserFactory {
 
   public static final Logger LOG = 
LoggerFactory.getLogger(ParserFactory.class);
@@ -44,8 +43,7 @@ public final class ParserFactory {
   public static final String DEFAULT_PLUGIN = "*";
 
   /** Empty extension list for caching purposes. */
-  private final List<Extension> EMPTY_EXTENSION_LIST =
-    new ArrayList<Extension>();
+  private final List<Extension> EMPTY_EXTENSION_LIST = new 
ArrayList<Extension>();
 
   private final Configuration conf;
   private final ExtensionPoint extensionPoint;
@@ -56,10 +54,12 @@ public final class ParserFactory {
     ObjectCache objectCache = ObjectCache.get(conf);
     this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
         Parser.X_POINT_ID);
-    this.parsePluginList = 
(ParsePluginList)objectCache.getObject(ParsePluginList.class.getName());
+    this.parsePluginList = (ParsePluginList) objectCache
+        .getObject(ParsePluginList.class.getName());
     if (this.parsePluginList == null) {
       this.parsePluginList = new ParsePluginsReader().parse(conf);
-      objectCache.setObject(ParsePluginList.class.getName(), 
this.parsePluginList);
+      objectCache.setObject(ParsePluginList.class.getName(),
+          this.parsePluginList);
     }
 
     if (this.extensionPoint == null) {
@@ -71,33 +71,34 @@ public final class ParserFactory {
     }
   }
 
-
   /**
    * Function returns an array of {@link Parser}s for a given content type.
-   *
+   * 
    * The function consults the internal list of parse plugins for the
-   * ParserFactory to determine the list of pluginIds, then gets the
-   * appropriate extension points to instantiate as {@link Parser}s.
-   *
-   * @param contentType The contentType to return the <code>Array</code>
-   *                    of {@link Parser}s for.
-   * @param url The url for the content that may allow us to get the type from
-   *            the file suffix.
+   * ParserFactory to determine the list of pluginIds, then gets the 
appropriate
+   * extension points to instantiate as {@link Parser}s.
+   * 
+   * @param contentType
+   *          The contentType to return the <code>Array</code> of {@link 
Parser}
+   *          s for.
+   * @param url
+   *          The url for the content that may allow us to get the type from 
the
+   *          file suffix.
    * @return An <code>Array</code> of {@link Parser}s for the given 
contentType.
    *         If there were plugins mapped to a contentType via the
-   *         <code>parse-plugins.xml</code> file, but never enabled via
-   *         the <code>plugin.includes</code> Nutch conf, then those plugins
-   *         won't be part of this array, i.e., they will be skipped.
-   *         So, if the ordered list of parsing plugins for
-   *         <code>text/plain</code> was <code>[parse-text,parse-html,
+   *         <code>parse-plugins.xml</code> file, but never enabled via the
+   *         <code>plugin.includes</code> Nutch conf, then those plugins won't
+   *         be part of this array, i.e., they will be skipped. So, if the
+   *         ordered list of parsing plugins for <code>text/plain</code> was
+   *         <code>[parse-text,parse-html,
    *         parse-rtf]</code>, and only <code>parse-html</code> and
    *         <code>parse-rtf</code> were enabled via
-   *         <code>plugin.includes</code>, then this ordered Array would
-   *         consist of two {@link Parser} interfaces,
+   *         <code>plugin.includes</code>, then this ordered Array would 
consist
+   *         of two {@link Parser} interfaces,
    *         <code>[parse-html, parse-rtf]</code>.
    */
   public Parser[] getParsers(String contentType, String url)
-  throws ParserNotFound {
+      throws ParserNotFound {
 
     List<Parser> parsers = null;
     List<Extension> parserExts = null;
@@ -107,7 +108,7 @@ public final class ParserFactory {
     // TODO once the MimeTypes is available
     // parsers = getExtensions(MimeUtils.map(contentType));
     // if (parsers != null) {
-    //   return parsers;
+    // return parsers;
     // }
     // Last Chance: Guess content-type from file url...
     // parsers = getExtensions(MimeUtils.getMimeType(url));
@@ -118,49 +119,50 @@ public final class ParserFactory {
     }
 
     parsers = new ArrayList<Parser>(parserExts.size());
-    for (Extension ext : parserExts){
+    for (Extension ext : parserExts) {
       Parser p = null;
       try {
-        //check to see if we've cached this parser instance yet
+        // check to see if we've cached this parser instance yet
         p = (Parser) objectCache.getObject(ext.getId());
         if (p == null) {
           // go ahead and instantiate it and then cache it
           p = (Parser) ext.getExtensionInstance();
-          objectCache.setObject(ext.getId(),p);
+          objectCache.setObject(ext.getId(), p);
         }
         parsers.add(p);
       } catch (PluginRuntimeException e) {
         if (LOG.isWarnEnabled()) {
           LOG.warn("ParserFactory:PluginRuntimeException when "
-                 + "initializing parser plugin "
-                 + ext.getDescriptor().getPluginId()
-                 + " instance in getParsers "
-                 + "function: attempting to continue instantiating parsers: ", 
e);
+              + "initializing parser plugin "
+              + ext.getDescriptor().getPluginId() + " instance in getParsers "
+              + "function: attempting to continue instantiating parsers: ", e);
         }
       }
     }
-    return parsers.toArray(new Parser[]{});
+    return parsers.toArray(new Parser[] {});
   }
 
   /**
    * Function returns a {@link Parser} instance with the specified
-   * <code>extId</code>, representing its extension ID. If the Parser
-   * instance isn't found, then the function throws a
-   * <code>ParserNotFound</code> exception. If the function is able to find
-   * the {@link Parser} in the internal <code>PARSER_CACHE</code> then it
-   * will return the already instantiated Parser. Otherwise, if it has to
-   * instantiate the Parser itself , then this function will cache that Parser
-   * in the internal <code>PARSER_CACHE</code>.
-   *
-   * @param id The string extension ID (e.g.,
-   *        "org.apache.nutch.parse.rss.RSSParser",
-   *        "org.apache.nutch.parse.rtf.RTFParseFactory") of the {@link Parser}
-   *        implementation to return.
+   * <code>extId</code>, representing its extension ID. If the Parser instance
+   * isn't found, then the function throws a <code>ParserNotFound</code>
+   * exception. If the function is able to find the {@link Parser} in the
+   * internal <code>PARSER_CACHE</code> then it will return the already
+   * instantiated Parser. Otherwise, if it has to instantiate the Parser itself
+   * , then this function will cache that Parser in the internal
+   * <code>PARSER_CACHE</code>.
+   * 
+   * @param id
+   *          The string extension ID (e.g.,
+   *          "org.apache.nutch.parse.rss.RSSParser",
+   *          "org.apache.nutch.parse.rtf.RTFParseFactory") of the
+   *          {@link Parser} implementation to return.
    * @return A {@link Parser} implementation specified by the parameter
    *         <code>id</code>.
-   * @throws ParserNotFound If the Parser is not found (i.e., registered with
-   *         the extension point), or if the there a
-   *         {@link PluginRuntimeException} instantiating the {@link Parser}.
+   * @throws ParserNotFound
+   *           If the Parser is not found (i.e., registered with the extension
+   *           point), or if the there a {@link PluginRuntimeException}
+   *           instantiating the {@link Parser}.
    */
   public Parser getParserById(String id) throws ParserNotFound {
 
@@ -184,7 +186,7 @@ public final class ParserFactory {
     if (objectCache.getObject(parserExt.getId()) != null) {
       return (Parser) objectCache.getObject(parserExt.getId());
 
-    // if not found in cache, instantiate the Parser
+      // if not found in cache, instantiate the Parser
     } else {
       try {
         Parser p = (Parser) parserExt.getExtensionInstance();
@@ -192,9 +194,9 @@ public final class ParserFactory {
         return p;
       } catch (PluginRuntimeException e) {
         if (LOG.isWarnEnabled()) {
-          LOG.warn("Canno initialize parser " +
-                   parserExt.getDescriptor().getPluginId() +
-                   " (cause: " + e.toString());
+          LOG.warn("Canno initialize parser "
+              + parserExt.getDescriptor().getPluginId() + " (cause: "
+              + e.toString());
         }
         throw new ParserNotFound("Cannot init parser for id [" + id + "]");
       }
@@ -212,7 +214,7 @@ public final class ParserFactory {
           columns.addAll(pluginFields);
         }
       } catch (PluginRuntimeException e) {
-        LOG.error("PluginRuntimeException",e);
+        LOG.error("PluginRuntimeException", e);
       }
     }
     return columns;
@@ -220,10 +222,11 @@ public final class ParserFactory {
 
   /**
    * Finds the best-suited parse plugin for a given contentType.
-   *
-   * @param contentType Content-Type for which we seek a parse plugin.
-   * @return a list of extensions to be used for this contentType.
-   *         If none, returns <code>null</code>.
+   * 
+   * @param contentType
+   *          Content-Type for which we seek a parse plugin.
+   * @return a list of extensions to be used for this contentType. If none,
+   *         returns <code>null</code>.
    */
   @SuppressWarnings("unchecked")
   protected List<Extension> getExtensions(String contentType) {
@@ -246,8 +249,8 @@ public final class ParserFactory {
       if (extensions != null) {
         objectCache.setObject(type, extensions);
       } else {
-       // Put the empty extension list into cache
-       // to remember we don't know any related extension.
+        // Put the empty extension list into cache
+        // to remember we don't know any related extension.
         objectCache.setObject(type, EMPTY_EXTENSION_LIST);
       }
     }
@@ -256,22 +259,24 @@ public final class ParserFactory {
 
   /**
    * searches a list of suitable parse plugins for the given contentType.
-   * <p>It first looks for a preferred plugin defined in the parse-plugin
-   * file.  If none is found, it returns a list of default plugins.
-   *
-   * @param contentType Content-Type for which we seek a parse plugin.
-   * @return List - List of extensions to be used for this contentType.
-   *                If none, returns null.
+   * <p>
+   * It first looks for a preferred plugin defined in the parse-plugin file. If
+   * none is found, it returns a list of default plugins.
+   * 
+   * @param contentType
+   *          Content-Type for which we seek a parse plugin.
+   * @return List - List of extensions to be used for this contentType. If 
none,
+   *         returns null.
    */
   private List<Extension> findExtensions(String contentType) {
 
     Extension[] extensions = this.extensionPoint.getExtensions();
 
     // Look for a preferred plugin.
-    List<String> parsePluginList =
-      this.parsePluginList.getPluginList(contentType);
-    List<Extension> extensionList =
-      matchExtensions(parsePluginList, extensions, contentType);
+    List<String> parsePluginList = this.parsePluginList
+        .getPluginList(contentType);
+    List<Extension> extensionList = matchExtensions(parsePluginList,
+        extensions, contentType);
     if (extensionList != null) {
       return extensionList;
     }
@@ -284,20 +289,23 @@ public final class ParserFactory {
   /**
    * Tries to find a suitable parser for the given contentType.
    * <ol>
-   * <li>It checks if a parser which accepts the contentType
-   * can be found in the <code>plugins</code> list;</li>
-   * <li>If this list is empty, it tries to find amongst the loaded
-   * extensions whether some of them might suit and warns the user.</li>
+   * <li>It checks if a parser which accepts the contentType can be found in 
the
+   * <code>plugins</code> list;</li>
+   * <li>If this list is empty, it tries to find amongst the loaded extensions
+   * whether some of them might suit and warns the user.</li>
    * </ol>
-   * @param plugins List of candidate plugins.
-   * @param extensions Array of loaded extensions.
-   * @param contentType Content-Type for which we seek a parse plugin.
-   * @return List - List of extensions to be used for this contentType.
-   *                If none, returns null.
+   * 
+   * @param plugins
+   *          List of candidate plugins.
+   * @param extensions
+   *          Array of loaded extensions.
+   * @param contentType
+   *          Content-Type for which we seek a parse plugin.
+   * @return List - List of extensions to be used for this contentType. If 
none,
+   *         returns null.
    */
   private List<Extension> matchExtensions(List<String> plugins,
-                               Extension[] extensions,
-                               String contentType) {
+      Extension[] extensions, String contentType) {
 
     List<Extension> extList = new ArrayList<Extension>();
     if (plugins != null) {
@@ -315,7 +323,7 @@ public final class ParserFactory {
         // in either case, LOG the appropriate error message to WARN level
 
         if (ext == null) {
-          //try to get it just by its pluginId
+          // try to get it just by its pluginId
           ext = getExtension(extensions, parsePluginId);
 
           if (LOG.isWarnEnabled()) {
@@ -323,17 +331,17 @@ public final class ParserFactory {
               // plugin was enabled via plugin.includes
               // its plugin.xml just doesn't claim to support that
               // particular mimeType
-              LOG.warn("ParserFactory:Plugin: " + parsePluginId +
-                       " mapped to contentType " + contentType +
-                       " via parse-plugins.xml, but " + "its plugin.xml " +
-                       "file does not claim to support contentType: " +
-                       contentType);
+              LOG.warn("ParserFactory:Plugin: " + parsePluginId
+                  + " mapped to contentType " + contentType
+                  + " via parse-plugins.xml, but " + "its plugin.xml "
+                  + "file does not claim to support contentType: "
+                  + contentType);
             } else {
               // plugin wasn't enabled via plugin.includes
-              LOG.warn("ParserFactory: Plugin: " + parsePluginId +
-                       " mapped to contentType " + contentType +
-                       " via parse-plugins.xml, but not enabled via " +
-                       "plugin.includes in nutch-default.xml");
+              LOG.warn("ParserFactory: Plugin: " + parsePluginId
+                  + " mapped to contentType " + contentType
+                  + " via parse-plugins.xml, but not enabled via "
+                  + "plugin.includes in nutch-default.xml");
             }
           }
         }
@@ -353,12 +361,12 @@ public final class ParserFactory {
       // any extensions where this is the case, throw a
       // NotMappedParserException
 
-      for (int i=0; i<extensions.length; i++) {
-       if ("*".equals(extensions[i].getAttribute("contentType"))){
+      for (int i = 0; i < extensions.length; i++) {
+        if ("*".equals(extensions[i].getAttribute("contentType"))) {
           extList.add(0, extensions[i]);
-        }
-       else if (extensions[i].getAttribute("contentType") != null
-            && 
contentType.matches(escapeContentType(extensions[i].getAttribute("contentType"))))
 {
+        } else if (extensions[i].getAttribute("contentType") != null
+            && contentType.matches(escapeContentType(extensions[i]
+                .getAttribute("contentType")))) {
           extList.add(extensions[i]);
         }
       }
@@ -367,21 +375,23 @@ public final class ParserFactory {
         if (LOG.isInfoEnabled()) {
           StringBuffer extensionsIDs = new StringBuffer("[");
           boolean isFirst = true;
-          for (Extension ext : extList){
-                 if (!isFirst) extensionsIDs.append(" - ");
-                 else isFirst=false;
-                 extensionsIDs.append(ext.getId());
+          for (Extension ext : extList) {
+            if (!isFirst)
+              extensionsIDs.append(" - ");
+            else
+              isFirst = false;
+            extensionsIDs.append(ext.getId());
           }
-         extensionsIDs.append("]");
-          LOG.info("The parsing plugins: " + extensionsIDs.toString() +
-                   " are enabled via the plugin.includes system " +
-                   "property, and all claim to support the content type " +
-                   contentType + ", but they are not mapped to it  in the " +
-                   "parse-plugins.xml file");
+          extensionsIDs.append("]");
+          LOG.info("The parsing plugins: " + extensionsIDs.toString()
+              + " are enabled via the plugin.includes system "
+              + "property, and all claim to support the content type "
+              + contentType + ", but they are not mapped to it  in the "
+              + "parse-plugins.xml file");
         }
       } else if (LOG.isDebugEnabled()) {
-        LOG.debug("ParserFactory:No parse plugins mapped or enabled for " +
-                  "contentType " + contentType);
+        LOG.debug("ParserFactory:No parse plugins mapped or enabled for "
+            + "contentType " + contentType);
       }
     }
 
@@ -389,23 +399,22 @@ public final class ParserFactory {
   }
 
   private String escapeContentType(String contentType) {
-       // Escapes contentType in order to use as a regex 
-       // (and keep backwards compatibility).
-       // This enables to accept multiple types for a single parser. 
-       return contentType.replace("+", "\\+").replace(".", "\\.");
-       }
-
+    // Escapes contentType in order to use as a regex
+    // (and keep backwards compatibility).
+    // This enables to accept multiple types for a single parser.
+    return contentType.replace("+", "\\+").replace(".", "\\.");
+  }
 
-       private boolean match(Extension extension, String id, String type) {
-    return (id.equals(extension.getId())) &&
-            (extension.getAttribute("contentType").equals("*") ||
-             
type.matches(escapeContentType(extension.getAttribute("contentType"))) ||
-             type.equals(DEFAULT_PLUGIN));
+  private boolean match(Extension extension, String id, String type) {
+    return (id.equals(extension.getId()))
+        && (extension.getAttribute("contentType").equals("*")
+            || type.matches(escapeContentType(extension
+                .getAttribute("contentType"))) || type.equals(DEFAULT_PLUGIN));
   }
 
   /** Get an extension from its id and supported content-type. */
   private Extension getExtension(Extension[] list, String id, String type) {
-    for (int i=0; i<list.length; i++) {
+    for (int i = 0; i < list.length; i++) {
       if (match(list[i], id, type)) {
         return list[i];
       }
@@ -414,7 +423,7 @@ public final class ParserFactory {
   }
 
   private Extension getExtension(Extension[] list, String id) {
-    for (int i=0; i<list.length; i++) {
+    for (int i = 0; i < list.length; i++) {
       if (id.equals(list[i].getId())) {
         return list[i];
       }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java Fri Jan  
9 06:34:33 2015
@@ -58,9 +58,9 @@ public class ParserJob extends NutchTool
 
   private static final String RESUME_KEY = "parse.job.resume";
   private static final String FORCE_KEY = "parse.job.force";
-  
+
   public static final String SKIP_TRUNCATED = "parser.skip.truncated";
-  
+
   private static final Utf8 REPARSE = new Utf8("-reparse");
 
   private static final Collection<WebPage.Field> FIELDS = new 
HashSet<WebPage.Field>();
@@ -79,9 +79,8 @@ public class ParserJob extends NutchTool
     FIELDS.add(WebPage.Field.HEADERS);
   }
 
-
-  public static class ParserMapper 
-      extends GoraMapper<String, WebPage, String, WebPage> {
+  public static class ParserMapper extends
+      GoraMapper<String, WebPage, String, WebPage> {
     private ParseUtil parseUtil;
 
     private boolean shouldResume;
@@ -91,15 +90,16 @@ public class ParserJob extends NutchTool
     private Utf8 batchId;
 
     private boolean skipTruncated;
-    
+
     @Override
     public void setup(Context context) throws IOException {
       Configuration conf = context.getConfiguration();
       parseUtil = new ParseUtil(conf);
       shouldResume = conf.getBoolean(RESUME_KEY, false);
       force = conf.getBoolean(FORCE_KEY, false);
-      batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID, 
Nutch.ALL_BATCH_ID_STR));
-      skipTruncated=conf.getBoolean(SKIP_TRUNCATED, true);
+      batchId = new Utf8(
+          conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
+      skipTruncated = conf.getBoolean(SKIP_TRUNCATED, true);
     }
 
     @Override
@@ -131,7 +131,6 @@ public class ParserJob extends NutchTool
       if (skipTruncated && isTruncated(unreverseKey, page)) {
         return;
       }
-      
 
       parseUtil.process(key, page);
       ParseStatus pstatus = page.getParseStatus();
@@ -141,9 +140,9 @@ public class ParserJob extends NutchTool
       }
 
       context.write(key, page);
-    }    
+    }
   }
-  
+
   public ParserJob() {
 
   }
@@ -151,20 +150,22 @@ public class ParserJob extends NutchTool
   public ParserJob(Configuration conf) {
     setConf(conf);
   }
-  
+
   /**
    * Checks if the page's content is truncated.
-   * @param url 
+   * 
+   * @param url
    * @param page
-   * @return If the page is truncated <code>true</code>. When it is not,
-   * or when it could be determined, <code>false</code>. 
+   * @return If the page is truncated <code>true</code>. When it is not, or 
when
+   *         it could be determined, <code>false</code>.
    */
   public static boolean isTruncated(String url, WebPage page) {
     ByteBuffer content = page.getContent();
     if (content == null) {
       return false;
     }
-    CharSequence lengthUtf8 = page.getHeaders().get(new 
Utf8(HttpHeaders.CONTENT_LENGTH));
+    CharSequence lengthUtf8 = page.getHeaders().get(
+        new Utf8(HttpHeaders.CONTENT_LENGTH));
     if (lengthUtf8 == null) {
       return false;
     }
@@ -186,7 +187,8 @@ public class ParserJob extends NutchTool
       return true;
     }
     if (LOG.isDebugEnabled()) {
-      LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + 
inHeaderSize);
+      LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize="
+          + inHeaderSize);
     }
     return false;
   }
@@ -198,8 +200,8 @@ public class ParserJob extends NutchTool
     ParseFilters parseFilters = new ParseFilters(conf);
 
     Collection<WebPage.Field> parsePluginFields = parserFactory.getFields();
-    Collection<WebPage.Field> signaturePluginFields =
-      SignatureFactory.getFields(conf);
+    Collection<WebPage.Field> signaturePluginFields = SignatureFactory
+        .getFields(conf);
     Collection<WebPage.Field> htmlParsePluginFields = parseFilters.getFields();
 
     if (parsePluginFields != null) {
@@ -226,11 +228,11 @@ public class ParserJob extends NutchTool
   }
 
   @Override
-  public Map<String,Object> run(Map<String,Object> args) throws Exception {
-    String batchId = (String)args.get(Nutch.ARG_BATCH);
-    Boolean shouldResume = (Boolean)args.get(Nutch.ARG_RESUME);
-    Boolean force = (Boolean)args.get(Nutch.ARG_FORCE);
-    
+  public Map<String, Object> run(Map<String, Object> args) throws Exception {
+    String batchId = (String) args.get(Nutch.ARG_BATCH);
+    Boolean shouldResume = (Boolean) args.get(Nutch.ARG_RESUME);
+    Boolean force = (Boolean) args.get(Nutch.ARG_FORCE);
+
     if (batchId != null) {
       getConf().set(GeneratorJob.BATCH_ID, batchId);
     }
@@ -241,17 +243,18 @@ public class ParserJob extends NutchTool
       getConf().setBoolean(FORCE_KEY, force);
     }
     LOG.info("ParserJob: resuming:\t" + getConf().getBoolean(RESUME_KEY, 
false));
-    LOG.info("ParserJob: forced reparse:\t" + getConf().getBoolean(FORCE_KEY, 
false));
+    LOG.info("ParserJob: forced reparse:\t"
+        + getConf().getBoolean(FORCE_KEY, false));
     if (batchId == null || batchId.equals(Nutch.ALL_BATCH_ID_STR)) {
       LOG.info("ParserJob: parsing all");
     } else {
       LOG.info("ParserJob: batchId:\t" + batchId);
     }
     currentJob = new NutchJob(getConf(), "parse");
-    
+
     Collection<WebPage.Field> fields = getFields(currentJob);
     MapFieldValueFilter<String, WebPage> batchIdFilter = 
getBatchIdFilter(batchId);
-       StorageUtils.initMapperJob(currentJob, fields, String.class, 
WebPage.class,
+    StorageUtils.initMapperJob(currentJob, fields, String.class, WebPage.class,
         ParserMapper.class, batchIdFilter);
     StorageUtils.initReducerJob(currentJob, IdentityPageReducer.class);
     currentJob.setNumReduceTasks(0);
@@ -275,20 +278,20 @@ public class ParserJob extends NutchTool
     return filter;
   }
 
-  public int parse(String batchId, boolean shouldResume, boolean force) throws 
Exception {
-    
+  public int parse(String batchId, boolean shouldResume, boolean force)
+      throws Exception {
+
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
     LOG.info("ParserJob: starting at " + sdf.format(start));
 
-    run(ToolUtil.toArgMap(
-        Nutch.ARG_BATCH, batchId,
-        Nutch.ARG_RESUME, shouldResume,
-        Nutch.ARG_FORCE, force));
+    run(ToolUtil.toArgMap(Nutch.ARG_BATCH, batchId, Nutch.ARG_RESUME,
+        shouldResume, Nutch.ARG_FORCE, force));
     LOG.info("ParserJob: success");
-    
+
     long finish = System.currentTimeMillis();
-    LOG.info("ParserJob: finished at " + sdf.format(finish) + ", time elapsed: 
" + TimingUtil.elapsedTime(start, finish));
+    LOG.info("ParserJob: finished at " + sdf.format(finish)
+        + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
     return 0;
   }
 
@@ -298,12 +301,18 @@ public class ParserJob extends NutchTool
     String batchId = null;
 
     if (args.length < 1) {
-      System.err.println("Usage: ParserJob (<batchId> | -all) [-crawlId <id>] 
[-resume] [-force]");
-      System.err.println("    <batchId>     - symbolic batch ID created by 
Generator");
-      System.err.println("    -crawlId <id> - the id to prefix the schemas to 
operate on, \n \t \t    (default: storage.crawl.id)");
-      System.err.println("    -all          - consider pages from all crawl 
jobs");
-      System.err.println("    -resume       - resume a previous incomplete 
job");
-      System.err.println("    -force        - force re-parsing even if a page 
is already parsed");
+      System.err
+          .println("Usage: ParserJob (<batchId> | -all) [-crawlId <id>] 
[-resume] [-force]");
+      System.err
+          .println("    <batchId>     - symbolic batch ID created by 
Generator");
+      System.err
+          .println("    -crawlId <id> - the id to prefix the schemas to 
operate on, \n \t \t    (default: storage.crawl.id)");
+      System.err
+          .println("    -all          - consider pages from all crawl jobs");
+      System.err
+          .println("    -resume       - resume a previous incomplete job");
+      System.err
+          .println("    -force        - force re-parsing even if a page is 
already parsed");
       return -1;
     }
     for (int i = 0; i < args.length; i++) {

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserNotFound.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserNotFound.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserNotFound.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserNotFound.java Fri 
Jan  9 06:34:33 2015
@@ -18,17 +18,17 @@ package org.apache.nutch.parse;
 
 public class ParserNotFound extends ParseException {
 
-  private static final long serialVersionUID=23993993939L;
+  private static final long serialVersionUID = 23993993939L;
   private String url;
   private String contentType;
 
-  public ParserNotFound(String message){
-    super(message);    
+  public ParserNotFound(String message) {
+    super(message);
   }
-  
+
   public ParserNotFound(String url, String contentType) {
-    this(url, contentType,
-         "parser not found for contentType="+contentType+" url="+url);
+    this(url, contentType, "parser not found for contentType=" + contentType
+        + " url=" + url);
   }
 
   public ParserNotFound(String url, String contentType, String message) {
@@ -37,6 +37,11 @@ public class ParserNotFound extends Pars
     this.contentType = contentType;
   }
 
-  public String getUrl() { return url; }
-  public String getContentType() { return contentType; }
+  public String getUrl() {
+    return url;
+  }
+
+  public String getContentType() {
+    return contentType;
+  }
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/package-info.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/package-info.java Fri 
Jan  9 06:34:33 2015
@@ -19,3 +19,4 @@
  * The {@link org.apache.nutch.parse.Parse Parse} interface and related 
classes.
  */
 package org.apache.nutch.parse;
+

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/CircularDependencyException.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/CircularDependencyException.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/CircularDependencyException.java
 (original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/CircularDependencyException.java
 Fri Jan  9 06:34:33 2015
@@ -16,7 +16,6 @@
  */
 package org.apache.nutch.plugin;
 
-
 /**
  * <code>CircularDependencyException</code> will be thrown if a circular
  * dependency is detected.

Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/Extension.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/Extension.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/Extension.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/Extension.java Fri Jan  
9 06:34:33 2015
@@ -94,8 +94,10 @@ public class Extension {
    * Adds a attribute and is only used until model creation at plugin system
    * start up.
    * 
-   * @param pKey a key
-   * @param pValue a value
+   * @param pKey
+   *          a key
+   * @param pValue
+   *          a value
    */
   public void addAttribute(String pKey, String pValue) {
     fAttributes.put(pKey, pValue);
@@ -105,7 +107,8 @@ public class Extension {
    * Sets the Class that implement the concret extension and is only used until
    * model creation at system start up.
    * 
-   * @param extensionClazz The extensionClasname to set
+   * @param extensionClazz
+   *          The extensionClasname to set
    */
   public void setClazz(String extensionClazz) {
     fClazz = extensionClazz;
@@ -115,7 +118,8 @@ public class Extension {
    * Sets the unique extension Id and is only used until model creation at
    * system start up.
    * 
-   * @param extensionID The extensionID to set
+   * @param extensionID
+   *          The extensionID to set
    */
   public void setId(String extensionID) {
     fId = extensionID;
@@ -147,10 +151,10 @@ public class Extension {
     // The same is in PluginRepository.getPluginInstance().
     // Suggested by Stefan Groschupf <[email protected]>
     synchronized (getId()) {
-      try {      
+      try {
         PluginRepository pluginRepository = PluginRepository.get(conf);
-        Class extensionClazz = 
-          pluginRepository.getCachedClass(fDescriptor, getClazz());
+        Class extensionClazz = pluginRepository.getCachedClass(fDescriptor,
+            getClazz());
         // lazy loading of Plugin in case there is no instance of the plugin
         // already.
         pluginRepository.getPluginInstance(getDescriptor());

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/ExtensionPoint.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/ExtensionPoint.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/ExtensionPoint.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/ExtensionPoint.java Fri 
Jan  9 06:34:33 2015
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 package org.apache.nutch.plugin;
+
 import java.util.ArrayList;
 
 /**
@@ -76,7 +77,8 @@ public class ExtensionPoint {
   /**
    * Sets the extensionPointId.
    * 
-   * @param pId extension point id
+   * @param pId
+   *          extension point id
    */
   private void setId(String pId) {
     ftId = pId;

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/MissingDependencyException.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/MissingDependencyException.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/MissingDependencyException.java
 (original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/MissingDependencyException.java
 Fri Jan  9 06:34:33 2015
@@ -17,8 +17,8 @@
 package org.apache.nutch.plugin;
 
 /**
- * <code>MissingDependencyException</code> will be thrown if a plugin
- * dependency cannot be found.
+ * <code>MissingDependencyException</code> will be thrown if a plugin 
dependency
+ * cannot be found.
  * 
  * @author J&eacute;r&ocirc;me Charron
  */

Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/Pluggable.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/Pluggable.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/Pluggable.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/Pluggable.java Fri Jan  
9 06:34:33 2015
@@ -17,15 +17,14 @@
 package org.apache.nutch.plugin;
 
 /**
- * Defines the capability of a class to be plugged into Nutch.
- * This is a common interface that must be implemented by all
- * Nutch Extension Points.
- *
+ * Defines the capability of a class to be plugged into Nutch. This is a common
+ * interface that must be implemented by all Nutch Extension Points.
+ * 
  * @author J&eacute;r&ocirc;me Charron
- *
+ * 
  * @see <a href="http://wiki.apache.org/nutch/AboutPlugins";>About Plugins</a>
- * @see <a href="package-summary.html#package_description">
- *      plugin package description</a>
+ * @see <a href="package-summary.html#package_description"> plugin package
+ *      description</a>
  */
 public interface Pluggable {
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/Plugin.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/Plugin.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/Plugin.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/Plugin.java Fri Jan  9 
06:34:33 2015
@@ -33,8 +33,8 @@ import org.apache.hadoop.conf.Configurat
  * The <code>Plugin</code> will be startuped and shutdown by the nutch plugin
  * management system.
  * 
- * A possible usecase of the <code>Plugin</code> implementation is to create
- * or close a database connection.
+ * A possible usecase of the <code>Plugin</code> implementation is to create or
+ * close a database connection.
  * 
  * @author joa23
  */
@@ -81,7 +81,8 @@ public class Plugin {
   }
 
   /**
-   * @param descriptor The descriptor to set
+   * @param descriptor
+   *          The descriptor to set
    */
   private void setDescriptor(PluginDescriptor descriptor) {
     fDescriptor = descriptor;

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginClassLoader.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginClassLoader.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginClassLoader.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginClassLoader.java 
Fri Jan  9 06:34:33 2015
@@ -45,11 +45,11 @@ public class PluginClassLoader extends U
    */
   public PluginClassLoader(URL[] urls, ClassLoader parent) {
     super(urls, parent);
-    
+
     this.urls = urls;
     this.parent = parent;
   }
-  
+
   @Override
   public int hashCode() {
     final int PRIME = 31;

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java 
Fri Jan  9 06:34:33 2015
@@ -30,12 +30,11 @@ import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 
 /**
- * The <code>PluginDescriptor</code> provide access to all meta information of
- * a nutch-plugin, as well to the internationalizable resources and the plugin
- * own classloader. There are meta information about <code>Plugin</code>,
- * <code>ExtensionPoint</code> and <code>Extension</code>. To provide
- * access to the meta data of a plugin via a descriptor allow a lazy loading
- * mechanism.
+ * The <code>PluginDescriptor</code> provide access to all meta information of 
a
+ * nutch-plugin, as well to the internationalizable resources and the plugin 
own
+ * classloader. There are meta information about <code>Plugin</code>,
+ * <code>ExtensionPoint</code> and <code>Extension</code>. To provide access to
+ * the meta data of a plugin via a descriptor allow a lazy loading mechanism.
  */
 public class PluginDescriptor {
   private String fPluginPath;
@@ -51,7 +50,8 @@ public class PluginDescriptor {
   private ArrayList<URL> fNotExportedLibs = new ArrayList<URL>();
   private ArrayList<Extension> fExtensions = new ArrayList<Extension>();
   private PluginClassLoader fClassLoader;
-  public static final Logger LOG = 
LoggerFactory.getLogger(PluginDescriptor.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(PluginDescriptor.class);
   private Configuration fConf;
 
   /**
@@ -204,7 +204,8 @@ public class PluginDescriptor {
   /**
    * Adds a dependency
    * 
-   * @param pId id of the dependent plugin
+   * @param pId
+   *          id of the dependent plugin
    */
   public void addDependency(String pId) {
     fDependencies.add(pId);
@@ -217,7 +218,8 @@ public class PluginDescriptor {
    */
   public void addExportedLibRelative(String pLibPath)
       throws MalformedURLException {
-    URL url = new File(getPluginPath() + File.separator + 
pLibPath).toURI().toURL();
+    URL url = new File(getPluginPath() + File.separator + pLibPath).toURI()
+        .toURL();
     fExportedLibs.add(url);
   }
 
@@ -246,7 +248,8 @@ public class PluginDescriptor {
    */
   public void addNotExportedLibRelative(String pLibPath)
       throws MalformedURLException {
-    URL url = new File(getPluginPath() + File.separator + 
pLibPath).toURI().toURL();
+    URL url = new File(getPluginPath() + File.separator + pLibPath).toURI()
+        .toURL();
     fNotExportedLibs.add(url);
   }
 
@@ -283,8 +286,8 @@ public class PluginDescriptor {
       LOG.debug(getPluginId() + " " + e.toString());
     }
     URL[] urls = arrayList.toArray(new URL[arrayList.size()]);
-    fClassLoader = new PluginClassLoader(urls, PluginDescriptor.class
-        .getClassLoader());
+    fClassLoader = new PluginClassLoader(urls,
+        PluginDescriptor.class.getClassLoader());
     return fClassLoader;
   }
 
@@ -306,7 +309,7 @@ public class PluginDescriptor {
     for (String id : pDescriptor.getDependencies()) {
       PluginDescriptor descriptor = PluginRepository.get(fConf)
           .getPluginDescriptor(id);
-      for (URL url: descriptor.getExportedLibUrls()) {
+      for (URL url : descriptor.getExportedLibUrls()) {
         pLibs.add(url);
       }
       collectLibs(pLibs, descriptor);

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java 
(original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java 
Fri Jan  9 06:34:33 2015
@@ -39,8 +39,8 @@ import org.w3c.dom.NodeList;
 import org.xml.sax.SAXException;
 
 /**
- * The <code>PluginManifestParser</code> parser just parse the manifest file
- * in all plugin directories.
+ * The <code>PluginManifestParser</code> parser just parse the manifest file in
+ * all plugin directories.
  * 
  * @author joa23
  */
@@ -93,7 +93,8 @@ public class PluginManifestParser {
             PluginDescriptor p = parseManifestFile(manifestPath);
             map.put(p.getPluginId(), p);
           } catch (Exception e) {
-            LOG.warn("Error while loading plugin `" + manifestPath + "` " + 
e.toString());
+            LOG.warn("Error while loading plugin `" + manifestPath + "` "
+                + e.toString());
           }
         }
       }
@@ -182,7 +183,7 @@ public class PluginManifestParser {
     PluginDescriptor pluginDescriptor = new PluginDescriptor(id, version, name,
         providerName, pluginClazz, pPath, this.conf);
     LOG.debug("plugin: id=" + id + " name=" + name + " version=" + version
-          + " provider=" + providerName + "class=" + pluginClazz);
+        + " provider=" + providerName + "class=" + pluginClazz);
     parseExtension(rootElement, pluginDescriptor);
     parseExtensionPoints(rootElement, pluginDescriptor);
     parseLibraries(rootElement, pluginDescriptor);
@@ -289,8 +290,8 @@ public class PluginManifestParser {
             if (parameters != null) {
               for (int k = 0; k < parameters.getLength(); k++) {
                 Element param = (Element) parameters.item(k);
-                extension.addAttribute(param.getAttribute(ATTR_NAME), param
-                    .getAttribute("value"));
+                extension.addAttribute(param.getAttribute(ATTR_NAME),
+                    param.getAttribute("value"));
               }
             }
             pPluginDescriptor.addExtension(extension);

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java 
Fri Jan  9 06:34:33 2015
@@ -50,13 +50,13 @@ public class PluginRepository {
   private HashMap<String, ExtensionPoint> fExtensionPoints;
 
   private HashMap<String, Plugin> fActivatedPlugins;
-  
-  private static final Map<String, Map<PluginClassLoader, Class>> CLASS_CACHE =
-    new HashMap<String, Map<PluginClassLoader,Class>>();
+
+  private static final Map<String, Map<PluginClassLoader, Class>> CLASS_CACHE 
= new HashMap<String, Map<PluginClassLoader, Class>>();
 
   private Configuration conf;
 
-  public static final Logger LOG = 
LoggerFactory.getLogger(PluginRepository.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(PluginRepository.class);
 
   /**
    * @throws PluginRuntimeException
@@ -68,7 +68,8 @@ public class PluginRepository {
     this.conf = new Configuration(conf);
     this.auto = conf.getBoolean("plugin.auto-activation", true);
     String[] pluginFolders = conf.getStrings("plugin.folders");
-    PluginManifestParser manifestParser = new PluginManifestParser(this.conf, 
this);
+    PluginManifestParser manifestParser = new PluginManifestParser(this.conf,
+        this);
     Map<String, PluginDescriptor> allPlugins = manifestParser
         .parsePluginFolder(pluginFolders);
     if (allPlugins.isEmpty()) {
@@ -85,7 +86,7 @@ public class PluginRepository {
     try {
       installExtensions(fRegisteredPlugins);
     } catch (PluginRuntimeException e) {
-        LOG.error(e.toString());
+      LOG.error(e.toString());
       throw new RuntimeException(e.getMessage());
     }
     displayStatus();
@@ -112,8 +113,8 @@ public class PluginRepository {
       return;
     }
 
-    for (PluginDescriptor plugin: plugins) {
-      for(ExtensionPoint point:plugin.getExtenstionPoints()) {
+    for (PluginDescriptor plugin : plugins) {
+      for (ExtensionPoint point : plugin.getExtenstionPoints()) {
         String xpId = point.getId();
         LOG.debug("Adding extension point " + xpId);
         fExtensionPoints.put(xpId, point);
@@ -128,7 +129,7 @@ public class PluginRepository {
       throws PluginRuntimeException {
 
     for (PluginDescriptor descriptor : pRegisteredPlugins) {
-      for(Extension extension:descriptor.getExtensions()) {
+      for (Extension extension : descriptor.getExtensions()) {
         String xpId = extension.getTargetPoint();
         ExtensionPoint point = getExtensionPoint(xpId);
         if (point == null) {
@@ -156,7 +157,7 @@ public class PluginRepository {
     branch.put(plugin.getPluginId(), plugin);
 
     // Otherwise, checks each dependency
-    for(String id:plugin.getDependencies()) {
+    for (String id : plugin.getDependencies()) {
       PluginDescriptor dependency = plugins.get(id);
       if (dependency == null) {
         throw new MissingDependencyException("Missing dependency " + id
@@ -271,7 +272,8 @@ public class PluginRepository {
       // The same is in Extension.getExtensionInstance().
       // Suggested by Stefan Groschupf <[email protected]>
       synchronized (pDescriptor) {
-        Class<?> pluginClass = getCachedClass(pDescriptor, 
pDescriptor.getPluginClass());
+        Class<?> pluginClass = getCachedClass(pDescriptor,
+            pDescriptor.getPluginClass());
         Constructor<?> constructor = pluginClass.getConstructor(new Class<?>[] 
{
             PluginDescriptor.class, Configuration.class });
         Plugin plugin = (Plugin) constructor.newInstance(new Object[] {
@@ -312,9 +314,9 @@ public class PluginRepository {
       plugin.shutDown();
     }
   }
-  
+
   public Class getCachedClass(PluginDescriptor pDescriptor, String className)
-  throws ClassNotFoundException {
+      throws ClassNotFoundException {
     Map<PluginClassLoader, Class> descMap = CLASS_CACHE.get(className);
     if (descMap == null) {
       descMap = new HashMap<PluginClassLoader, Class>();

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRuntimeException.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRuntimeException.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRuntimeException.java 
(original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRuntimeException.java 
Fri Jan  9 06:34:33 2015
@@ -16,6 +16,7 @@
  * limitations under the License.
  */
 package org.apache.nutch.plugin;
+
 /**
  * <code>PluginRuntimeException</code> will be thrown until a exception in the
  * plugin managemnt occurs.

Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java Fri Jan  
9 06:34:33 2015
@@ -41,7 +41,7 @@ import org.apache.nutch.metadata.Metadat
 import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.NutchConfiguration;
 
-public final class Content implements Writable{
+public final class Content implements Writable {
 
   public static final String DIR_NAME = "content";
 
@@ -85,7 +85,7 @@ public final class Content implements Wr
     this.mimeTypes = new MimeUtil(conf);
     this.contentType = getContentType(contentType, url, content);
   }
-  
+
   public Content(String url, String base, byte[] content, String contentType,
       Metadata metadata, MimeUtil mimeTypes) {
 
@@ -141,11 +141,11 @@ public final class Content implements Wr
       metadata.readFields(in); // read meta data
       break;
     default:
-      throw new VersionMismatchException((byte)2, oldVersion);
+      throw new VersionMismatchException((byte) 2, oldVersion);
     }
 
   }
-  
+
   public final void readFields(DataInput in) throws IOException {
     metadata.clear();
     int sizeOrVersion = in.readInt();
@@ -163,14 +163,14 @@ public final class Content implements Wr
         metadata.readFields(in);
         break;
       default:
-        throw new VersionMismatchException((byte)VERSION, (byte)version);
+        throw new VersionMismatchException((byte) VERSION, (byte) version);
       }
     } else { // size
       byte[] compressed = new byte[sizeOrVersion];
       in.readFully(compressed, 0, compressed.length);
       ByteArrayInputStream deflated = new ByteArrayInputStream(compressed);
-      DataInput inflater =
-        new DataInputStream(new InflaterInputStream(deflated));
+      DataInput inflater = new DataInputStream(
+          new InflaterInputStream(deflated));
       readFieldsCompressed(inflater);
     }
   }
@@ -204,8 +204,9 @@ public final class Content implements Wr
     return url;
   }
 
-  /** The base url for relative links contained in the content.
-   * Maybe be different from url if the request redirected.
+  /**
+   * The base url for relative links contained in the content. Maybe be
+   * different from url if the request redirected.
    */
   public String getBaseUrl() {
     return base;
@@ -220,7 +221,9 @@ public final class Content implements Wr
     this.content = content;
   }
 
-  /** The media type of the retrieved content.
+  /**
+   * The media type of the retrieved content.
+   * 
    * @see <a href="http://www.iana.org/assignments/media-types/";>
    *      http://www.iana.org/assignments/media-types/</a>
    */
@@ -276,9 +279,9 @@ public final class Content implements Wr
       System.out.println("usage:" + usage);
       return;
     }
-    
-    GenericOptionsParser optParser =
-      new GenericOptionsParser(NutchConfiguration.create(), args);
+
+    GenericOptionsParser optParser = new GenericOptionsParser(
+        NutchConfiguration.create(), args);
     String[] argv = optParser.getRemainingArgs();
     Configuration conf = optParser.getConfiguration();
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java Fri Jan 
 9 06:34:33 2015
@@ -25,7 +25,7 @@ import org.apache.nutch.storage.WebPage;
 
 import crawlercommons.robots.BaseRobotRules;
 
-/** A retriever of url content.  Implemented by protocol extensions. */
+/** A retriever of url content. Implemented by protocol extensions. */
 public interface Protocol extends FieldPluggable, Configurable {
   /** The name of the extension point. */
   public final static String X_POINT_ID = Protocol.class.getName();
@@ -55,7 +55,9 @@ public interface Protocol extends FieldP
 
   /**
    * Retrieve robot rules applicable for this url.
-   * @param url url to check
+   * 
+   * @param url
+   *          url to check
    * @param page
    * @return robot rules (specific for this url or default), never null
    */

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolNotFound.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolNotFound.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolNotFound.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolNotFound.java 
Fri Jan  9 06:34:33 2015
@@ -22,7 +22,7 @@ public class ProtocolNotFound extends Pr
   private String url;
 
   public ProtocolNotFound(String url) {
-    this(url, "protocol not found for url="+url);
+    this(url, "protocol not found for url=" + url);
   }
 
   public ProtocolNotFound(String url, String message) {
@@ -30,5 +30,7 @@ public class ProtocolNotFound extends Pr
     this.url = url;
   }
 
-  public String getUrl() { return url; }
+  public String getUrl() {
+    return url;
+  }
 }

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolOutput.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolOutput.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolOutput.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolOutput.java 
Fri Jan  9 06:34:33 2015
@@ -17,10 +17,10 @@
 
 package org.apache.nutch.protocol;
 
-
 /**
- * Simple aggregate to pass from protocol plugins both content and
- * protocol status.
+ * Simple aggregate to pass from protocol plugins both content and protocol
+ * status.
+ * 
  * @author Andrzej Bialecki &lt;[email protected]&gt;
  */
 public class ProtocolOutput {

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java 
(original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java 
Fri Jan  9 06:34:33 2015
@@ -19,38 +19,42 @@ package org.apache.nutch.protocol;
 public interface ProtocolStatusCodes {
 
   /** Content was retrieved without errors. */
-  public static final int SUCCESS              = 1;
+  public static final int SUCCESS = 1;
   /** Content was not retrieved. Any further errors may be indicated in args. 
*/
-  public static final int FAILED               = 2;
+  public static final int FAILED = 2;
 
-  /** This protocol was not found.  Application may attempt to retry later. */
-  public static final int PROTO_NOT_FOUND      = 10;
+  /** This protocol was not found. Application may attempt to retry later. */
+  public static final int PROTO_NOT_FOUND = 10;
   /** Resource is gone. */
-  public static final int GONE                 = 11;
+  public static final int GONE = 11;
   /** Resource has moved permanently. New url should be found in args. */
-  public static final int MOVED                = 12;
+  public static final int MOVED = 12;
   /** Resource has moved temporarily. New url should be found in args. */
-  public static final int TEMP_MOVED           = 13;
+  public static final int TEMP_MOVED = 13;
   /** Resource was not found. */
-  public static final int NOTFOUND             = 14;
+  public static final int NOTFOUND = 14;
   /** Temporary failure. Application may retry immediately. */
-  public static final int RETRY                = 15;
-  /** Unspecified exception occured. Further information may be provided in 
args. */
-  public static final int EXCEPTION            = 16;
+  public static final int RETRY = 15;
+  /**
+   * Unspecified exception occured. Further information may be provided in 
args.
+   */
+  public static final int EXCEPTION = 16;
   /** Access denied - authorization required, but missing/incorrect. */
-  public static final int ACCESS_DENIED        = 17;
+  public static final int ACCESS_DENIED = 17;
   /** Access denied by robots.txt rules. */
-  public static final int ROBOTS_DENIED        = 18;
+  public static final int ROBOTS_DENIED = 18;
   /** Too many redirects. */
-  public static final int REDIR_EXCEEDED       = 19;
+  public static final int REDIR_EXCEEDED = 19;
   /** Not fetching. */
-  public static final int NOTFETCHING          = 20;
+  public static final int NOTFETCHING = 20;
   /** Unchanged since the last fetch. */
-  public static final int NOTMODIFIED          = 21;
-  /** Request was refused by protocol plugins, because it would block.
-   * The expected number of milliseconds to wait before retry may be provided
-   * in args. */
-  public static final int WOULDBLOCK           = 22;
+  public static final int NOTMODIFIED = 21;
+  /**
+   * Request was refused by protocol plugins, because it would block. The
+   * expected number of milliseconds to wait before retry may be provided in
+   * args.
+   */
+  public static final int WOULDBLOCK = 22;
   /** Thread was blocked http.max.delays times during fetching. */
-  public static final int BLOCKED              = 23;
+  public static final int BLOCKED = 23;
 }

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java 
(original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java 
Fri Jan  9 06:34:33 2015
@@ -100,7 +100,7 @@ public class ProtocolStatusUtils impleme
     }
     return TableUtil.toString(args.iterator().next());
   }
-  
+
   public static String toString(ProtocolStatus status) {
     if (status == null) {
       return "(null)";
@@ -113,7 +113,8 @@ public class ProtocolStatusUtils impleme
       int i = 0;
       Iterator<CharSequence> it = args.iterator();
       while (it.hasNext()) {
-        if (i > 0) sb.append(',');
+        if (i > 0)
+          sb.append(',');
         sb.append(it.next());
         i++;
       }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRules.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRules.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRules.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRules.java Fri 
Jan  9 06:34:33 2015
@@ -35,9 +35,8 @@ public interface RobotRules {
   public long getCrawlDelay();
 
   /**
-   * Returns <code>false</code> if the <code>robots.txt</code> file
-   * prohibits us from accessing the given <code>url</code>, or
-   * <code>true</code> otherwise.
+   * Returns <code>false</code> if the <code>robots.txt</code> file prohibits 
us
+   * from accessing the given <code>url</code>, or <code>true</code> otherwise.
    */
   public boolean isAllowed(URL url);
 

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java 
Fri Jan  9 06:34:33 2015
@@ -43,35 +43,38 @@ import crawlercommons.robots.SimpleRobot
 import crawlercommons.robots.SimpleRobotRulesParser;
 
 /**
- * This class uses crawler-commons for handling the parsing of {@code 
robots.txt} files.
- * It emits SimpleRobotRules objects, which describe the download permissions
- * as described in SimpleRobotRulesParser.
+ * This class uses crawler-commons for handling the parsing of
+ * {@code robots.txt} files. It emits SimpleRobotRules objects, which describe
+ * the download permissions as described in SimpleRobotRulesParser.
  */
 public abstract class RobotRulesParser implements Configurable {
 
-  public static final Logger LOG = 
LoggerFactory.getLogger(RobotRulesParser.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(RobotRulesParser.class);
 
-  protected static final Hashtable<String, BaseRobotRules> CACHE = new 
Hashtable<String, BaseRobotRules> ();
+  protected static final Hashtable<String, BaseRobotRules> CACHE = new 
Hashtable<String, BaseRobotRules>();
 
   /**
-   *  A {@link BaseRobotRules} object appropriate for use
-   *  when the {@code robots.txt} file is empty or missing;
-   *  all requests are allowed.
+   * A {@link BaseRobotRules} object appropriate for use when the
+   * {@code robots.txt} file is empty or missing; all requests are allowed.
    */
-  public static final BaseRobotRules EMPTY_RULES = new 
SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
+  public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules(
+      RobotRulesMode.ALLOW_ALL);
 
   /**
-   *  A {@link BaseRobotRules} object appropriate for use when the 
-   *  {@code robots.txt} file is not fetched due to a {@code 403/Forbidden}
-   *  response; all requests are disallowed. 
+   * A {@link BaseRobotRules} object appropriate for use when the
+   * {@code robots.txt} file is not fetched due to a {@code 403/Forbidden}
+   * response; all requests are disallowed.
    */
-  public static BaseRobotRules FORBID_ALL_RULES = new 
SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
+  public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(
+      RobotRulesMode.ALLOW_NONE);
 
   private static SimpleRobotRulesParser robotParser = new 
SimpleRobotRulesParser();
   private Configuration conf;
   protected String agentNames;
 
-  public RobotRulesParser() { }
+  public RobotRulesParser() {
+  }
 
   public RobotRulesParser(Configuration conf) {
     setConf(conf);
@@ -90,16 +93,18 @@ public abstract class RobotRulesParser i
     }
     agentNames = agentName;
 
-    // If there are any other agents specified, append those to the list of 
agents
+    // If there are any other agents specified, append those to the list of
+    // agents
     String otherAgents = conf.get("http.robots.agents");
-    if(otherAgents != null && !otherAgents.trim().isEmpty()) {
+    if (otherAgents != null && !otherAgents.trim().isEmpty()) {
       StringTokenizer tok = new StringTokenizer(otherAgents, ",");
       StringBuilder sb = new StringBuilder(agentNames);
       while (tok.hasMoreTokens()) {
         String str = tok.nextToken().trim();
         if (str.equals("*") || str.equals(agentName)) {
           // skip wildcard "*" or agent name itself
-          // (required for backward compatibility, cf. NUTCH-1715 and 
NUTCH-1718)
+          // (required for backward compatibility, cf. NUTCH-1715 and
+          // NUTCH-1718)
         } else {
           sb.append(",").append(str);
         }
@@ -117,16 +122,23 @@ public abstract class RobotRulesParser i
   }
 
   /**
-   * Parses the robots content using the {@link SimpleRobotRulesParser} from 
crawler commons
-   *    
-   * @param url A string containing url
-   * @param content Contents of the robots file in a byte array 
-   * @param contentType The content type of the robots file
-   * @param robotName A string containing all the robots agent names used by 
parser for matching
-   * @return BaseRobotRules object 
+   * Parses the robots content using the {@link SimpleRobotRulesParser} from
+   * crawler commons
+   * 
+   * @param url
+   *          A string containing url
+   * @param content
+   *          Contents of the robots file in a byte array
+   * @param contentType
+   *          The content type of the robots file
+   * @param robotName
+   *          A string containing all the robots agent names used by parser for
+   *          matching
+   * @return BaseRobotRules object
    */
-  public BaseRobotRules parseRules (String url, byte[] content, String 
contentType, String robotName) {
-    return robotParser.parseContent(url, content, contentType, robotName); 
+  public BaseRobotRules parseRules(String url, byte[] content,
+      String contentType, String robotName) {
+    return robotParser.parseContent(url, content, contentType, robotName);
   }
 
   public BaseRobotRules getRobotRulesSet(Protocol protocol, String url) {
@@ -145,23 +157,29 @@ public abstract class RobotRulesParser i
   public static void main(String[] argv) {
 
     if (argv.length != 3) {
-      System.err.println("Usage: RobotRulesParser <robots-file> <url-file> 
<agent-names>\n");
-      System.err.println("    <robots-file> - Input robots.txt file which will 
be parsed.");
-      System.err.println("    <url-file>    - Contains input URLs (1 per line) 
which are tested against the rules.");
-      System.err.println("    <agent-names> - Input agent names. Multiple 
agent names can be provided using");
-      System.err.println("                    comma as a delimiter without any 
spaces.");
+      System.err
+          .println("Usage: RobotRulesParser <robots-file> <url-file> 
<agent-names>\n");
+      System.err
+          .println("    <robots-file> - Input robots.txt file which will be 
parsed.");
+      System.err
+          .println("    <url-file>    - Contains input URLs (1 per line) which 
are tested against the rules.");
+      System.err
+          .println("    <agent-names> - Input agent names. Multiple agent 
names can be provided using");
+      System.err
+          .println("                    comma as a delimiter without any 
spaces.");
       System.exit(-1);
     }
 
     try {
       byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
-      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, 
"text/plain", argv[2]);
+      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes,
+          "text/plain", argv[2]);
 
       LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
       String testPath = testsIn.readLine().trim();
       while (testPath != null) {
-        System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not 
allowed") +
-            ":\t" + testPath);
+        System.out.println((rules.isAllowed(testPath) ? "allowed"
+            : "not allowed") + ":\t" + testPath);
         testPath = testsIn.readLine();
       }
       testsIn.close();

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/package-info.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/package-info.java Fri 
Jan  9 06:34:33 2015
@@ -20,3 +20,4 @@
  * see also {@link org.apache.nutch.net.protocols}.
  */
 package org.apache.nutch.protocol;
+

svn commit: r1650447 [5/25] - in /nutch/branches/2.x: ./ src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/ src/java/org/apache/nutch/api/impl/db/ src/java/org/apache/nutch/api/model/response/ src/java/org/apache/nutch/api/resources/ sr...

Reply via email to