p...

lewismc Wed, 28 Jan 2015 21:39:40 -0800

Modified: 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 Thu Jan 29 05:38:59 2015
@@ -47,20 +47,20 @@ import org.apache.hadoop.io.Text;
 import crawlercommons.robots.BaseRobotRules;
 
 public abstract class HttpBase implements Protocol {
-  
+
   public static final Text RESPONSE_TIME = new Text("_rs_");
 
   public static final int BUFFER_SIZE = 8 * 1024;
-  
+
   private static final byte[] EMPTY_CONTENT = new byte[0];
 
   private HttpRobotRulesParser robots = null;
- 
-  /** The proxy hostname. */ 
+
+  /** The proxy hostname. */
   protected String proxyHost = null;
 
   /** The proxy port. */
-  protected int proxyPort = 8080; 
+  protected int proxyPort = 8080;
 
   /** Indicates if a proxy is used */
   protected boolean useProxy = false;
@@ -69,29 +69,27 @@ public abstract class HttpBase implement
   protected int timeout = 10000;
 
   /** The length limit for downloaded content, in bytes. */
-  protected int maxContent = 64 * 1024; 
+  protected int maxContent = 64 * 1024;
 
   /** The Nutch 'User-Agent' request header */
-  protected String userAgent = getAgentString(
-                        "NutchCVS", null, "Nutch",
-                        "http://nutch.apache.org/bot.html",
-                        "[email protected]");
+  protected String userAgent = getAgentString("NutchCVS", null, "Nutch",
+      "http://nutch.apache.org/bot.html", "[email protected]");
 
   /** The "Accept-Language" request header value. */
   protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
 
   /** The "Accept" request header value. */
   protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
-  
+
   /** The default logger */
   private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class);
 
   /** The specified logger */
   private Logger logger = LOGGER;
- 
+
   /** The nutch configuration */
   private Configuration conf = null;
-  
+
   /** Do we use HTTP/1.1? */
   protected boolean useHttp11 = false;
 
@@ -99,14 +97,14 @@ public abstract class HttpBase implement
    * Record response time in CrawlDatum's meta data, see property
    * http.store.responsetime.
    */
-  protected boolean responseTime = true;    
-  
+  protected boolean responseTime = true;
+
   /** Skip page if Crawl-Delay longer than this value. */
   protected long maxCrawlDelay = -1L;
-  
+
   /** Which TLS/SSL protocols to support */
   protected Set<String> tlsPreferredProtocols;
-  
+
   /** Which TLS/SSL cipher suites to support */
   protected Set<String> tlsPreferredCipherSuites;
 
@@ -114,7 +112,7 @@ public abstract class HttpBase implement
   public HttpBase() {
     this(null);
   }
-  
+
   /** Creates a new instance of HttpBase */
   public HttpBase(Logger logger) {
     if (logger != null) {
@@ -122,134 +120,168 @@ public abstract class HttpBase implement
     }
     robots = new HttpRobotRulesParser();
   }
-  
+
   // Inherited Javadoc
   public void setConf(Configuration conf) {
-      this.conf = conf;
-      this.proxyHost = conf.get("http.proxy.host");
-      this.proxyPort = conf.getInt("http.proxy.port", 8080);
-      this.useProxy = (proxyHost != null && proxyHost.length() > 0);
-      this.timeout = conf.getInt("http.timeout", 10000);
-      this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
-      this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf
-              .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
-      this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
-      this.accept = conf.get("http.accept", accept);
-      // backward-compatible default setting
-      this.useHttp11 = conf.getBoolean("http.useHttp11", false);
-      this.responseTime = conf.getBoolean("http.store.responsetime", true);
-      this.robots.setConf(conf);
-      
-      String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
-      String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", 
-          "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
-          "TLS_RSA_WITH_AES_256_CBC_SHA256","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
-          "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256","TLS_DHE_DSS_WITH_AES_256_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
-          "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA","TLS_RSA_WITH_AES_256_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
-          "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
-          "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256","TLS_RSA_WITH_AES_128_CBC_SHA256",
-          "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256","TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
-          "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA",
-          "TLS_RSA_WITH_AES_128_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
-          "TLS_DHE_RSA_WITH_AES_128_CBC_SHA","TLS_DHE_DSS_WITH_AES_128_CBC_SHA","TLS_ECDHE_ECDSA_WITH_RC4_128_SHA",
-          "TLS_ECDHE_RSA_WITH_RC4_128_SHA","SSL_RSA_WITH_RC4_128_SHA","TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
-          "TLS_ECDH_RSA_WITH_RC4_128_SHA","TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA",
-          "SSL_RSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
-          "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA","SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA","SSL_RSA_WITH_RC4_128_MD5",
-          "TLS_EMPTY_RENEGOTIATION_INFO_SCSV","TLS_RSA_WITH_NULL_SHA256","TLS_ECDHE_ECDSA_WITH_NULL_SHA",
-          "TLS_ECDHE_RSA_WITH_NULL_SHA","SSL_RSA_WITH_NULL_SHA","TLS_ECDH_ECDSA_WITH_NULL_SHA","TLS_ECDH_RSA_WITH_NULL_SHA",
-          "SSL_RSA_WITH_NULL_MD5","SSL_RSA_WITH_DES_CBC_SHA","SSL_DHE_RSA_WITH_DES_CBC_SHA","SSL_DHE_DSS_WITH_DES_CBC_SHA",
-          "TLS_KRB5_WITH_RC4_128_SHA","TLS_KRB5_WITH_RC4_128_MD5","TLS_KRB5_WITH_3DES_EDE_CBC_SHA","TLS_KRB5_WITH_3DES_EDE_CBC_MD5",
-          "TLS_KRB5_WITH_DES_CBC_SHA","TLS_KRB5_WITH_DES_CBC_MD5");
+    this.conf = conf;
+    this.proxyHost = conf.get("http.proxy.host");
+    this.proxyPort = conf.getInt("http.proxy.port", 8080);
+    this.useProxy = (proxyHost != null && proxyHost.length() > 0);
+    this.timeout = conf.getInt("http.timeout", 10000);
+    this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
+    this.userAgent = getAgentString(conf.get("http.agent.name"),
+        conf.get("http.agent.version"), conf.get("http.agent.description"),
+        conf.get("http.agent.url"), conf.get("http.agent.email"));
+    this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
+    this.accept = conf.get("http.accept", accept);
+    // backward-compatible default setting
+    this.useHttp11 = conf.getBoolean("http.useHttp11", false);
+    this.responseTime = conf.getBoolean("http.store.responsetime", true);
+    this.robots.setConf(conf);
+
+    String[] protocols = conf.getStrings("http.tls.supported.protocols",
+        "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
+    String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
+        "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384",
+        "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
+        "TLS_RSA_WITH_AES_256_CBC_SHA256",
+        "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384",
+        "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
+        "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256",
+        "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256",
+        "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
+        "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
+        "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
+        "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA",
+        "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
+        "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
+        "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
+        "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA",
+        "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
+        "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA",
+        "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA",
+        "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
+        "TLS_ECDH_RSA_WITH_RC4_128_SHA",
+        "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA",
+        "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA",
+        "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA",
+        "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
+        "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA",
+        "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5",
+        "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256",
+        "TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA",
+        "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA",
+        "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5",
+        "SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA",
+        "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA",
+        "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA",
+        "TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA",
+        "TLS_KRB5_WITH_DES_CBC_MD5");
 
-      tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
-      tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
+    tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
+    tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
 
-      logConf();
+    logConf();
   }
 
   // Inherited Javadoc
   public Configuration getConf() {
     return this.conf;
   }
-  
+
   public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
-    
+
     String urlString = url.toString();
     try {
       URL u = new URL(urlString);
-      
+
       long startTime = System.currentTimeMillis();
       Response response = getResponse(u, datum, false); // make a request
-      
-      if(this.responseTime) {
+
+      if (this.responseTime) {
         int elapsedTime = (int) (System.currentTimeMillis() - startTime);
         datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime));
       }
-      
+
       int code = response.getCode();
       byte[] content = response.getContent();
       Content c = new Content(u.toString(), u.toString(),
-                              (content == null ? EMPTY_CONTENT : content),
-                              response.getHeader("Content-Type"),
-                              response.getHeaders(), this.conf);
-      
+          (content == null ? EMPTY_CONTENT : content),
+          response.getHeader("Content-Type"), response.getHeaders(), this.conf);
+
       if (code == 200) { // got a good response
         return new ProtocolOutput(c); // return it
-        
+
       } else if (code >= 300 && code < 400) { // handle redirect
         String location = response.getHeader("Location");
         // some broken servers, such as MS IIS, use lowercase header name...
-        if (location == null) location = response.getHeader("location");
-        if (location == null) location = "";
+        if (location == null)
+          location = response.getHeader("location");
+        if (location == null)
+          location = "";
         u = new URL(u, location);
         int protocolStatusCode;
         switch (code) {
-          case 300:   // multiple choices, preferred value in Location
-            protocolStatusCode = ProtocolStatus.MOVED;
-            break;
-          case 301:   // moved permanently
-          case 305:   // use proxy (Location is URL of proxy)
-            protocolStatusCode = ProtocolStatus.MOVED;
-            break;
-          case 302:   // found (temporarily moved)
-          case 303:   // see other (redirect after POST)
-          case 307:   // temporary redirect
-            protocolStatusCode = ProtocolStatus.TEMP_MOVED;
-            break;
-          case 304:   // not modified
-            protocolStatusCode = ProtocolStatus.NOTMODIFIED;
-            break;
-          default:
-            protocolStatusCode = ProtocolStatus.MOVED;
+        case 300: // multiple choices, preferred value in Location
+          protocolStatusCode = ProtocolStatus.MOVED;
+          break;
+        case 301: // moved permanently
+        case 305: // use proxy (Location is URL of proxy)
+          protocolStatusCode = ProtocolStatus.MOVED;
+          break;
+        case 302: // found (temporarily moved)
+        case 303: // see other (redirect after POST)
+        case 307: // temporary redirect
+          protocolStatusCode = ProtocolStatus.TEMP_MOVED;
+          break;
+        case 304: // not modified
+          protocolStatusCode = ProtocolStatus.NOTMODIFIED;
+          break;
+        default:
+          protocolStatusCode = ProtocolStatus.MOVED;
         }
         // handle this in the higher layer.
         return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
       } else if (code == 400) { // bad request, mark as GONE
-        if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); }
+        if (logger.isTraceEnabled()) {
+          logger.trace("400 Bad request: " + u);
+        }
         return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
-      } else if (code == 401) { // requires authorization, but no valid auth provided.
-        if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); }
-        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+      } else if (code == 401) { // requires authorization, but no valid auth
+                                // provided.
+        if (logger.isTraceEnabled()) {
+          logger.trace("401 Authentication Required");
+        }
+        return new ProtocolOutput(c, new ProtocolStatus(
+            ProtocolStatus.ACCESS_DENIED, "Authentication required: "
                 + urlString));
       } else if (code == 404) {
-        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
+        return new ProtocolOutput(c, new ProtocolStatus(
+            ProtocolStatus.NOTFOUND, u));
       } else if (code == 410) { // permanently GONE
-        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + u));
+        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
+            "Http: " + code + " url=" + u));
       } else {
-        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
-                + u));
+        return new ProtocolOutput(c, new ProtocolStatus(
+            ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u));
       }
     } catch (Throwable e) {
       logger.error("Failed to get protocol output", e);
       return new ProtocolOutput(null, new ProtocolStatus(e));
     }
   }
-  
-  /* -------------------------- *
-   * </implementation:Protocol> *
-   * -------------------------- */
 
+  /*
+   * -------------------------- * </implementation:Protocol> *
+   * --------------------------
+   */
 
   public String getProxyHost() {
     return proxyHost;
@@ -274,70 +306,69 @@ public abstract class HttpBase implement
   public String getUserAgent() {
     return userAgent;
   }
-  
-  /** Value of "Accept-Language" request header sent by Nutch.
+
+  /**
+   * Value of "Accept-Language" request header sent by Nutch.
+   * 
    * @return The value of the header "Accept-Language" header.
    */
   public String getAcceptLanguage() {
-         return acceptLanguage;
+    return acceptLanguage;
   }
 
   public String getAccept() {
-         return accept;
+    return accept;
   }
 
   public boolean getUseHttp11() {
     return useHttp11;
   }
-  
+
   public Set<String> getTlsPreferredCipherSuites() {
     return tlsPreferredCipherSuites;
   }
-  
+
   public Set<String> getTlsPreferredProtocols() {
     return tlsPreferredProtocols;
   }
 
-  private static String getAgentString(String agentName,
-                                       String agentVersion,
-                                       String agentDesc,
-                                       String agentURL,
-                                       String agentEmail) {
-    
-    if ( (agentName == null) || (agentName.trim().length() == 0) ) {
+  private static String getAgentString(String agentName, String agentVersion,
+      String agentDesc, String agentURL, String agentEmail) {
+
+    if ((agentName == null) || (agentName.trim().length() == 0)) {
       // TODO : NUTCH-258
       if (LOGGER.isErrorEnabled()) {
         LOGGER.error("No User-Agent string set (http.agent.name)!");
       }
     }
-    
-    StringBuffer buf= new StringBuffer();
-    
+
+    StringBuffer buf = new StringBuffer();
+
     buf.append(agentName);
     if (agentVersion != null) {
       buf.append("/");
       buf.append(agentVersion);
     }
-    if ( ((agentDesc != null) && (agentDesc.length() != 0))
-    || ((agentEmail != null) && (agentEmail.length() != 0))
-    || ((agentURL != null) && (agentURL.length() != 0)) ) {
+    if (((agentDesc != null) && (agentDesc.length() != 0))
+        || ((agentEmail != null) && (agentEmail.length() != 0))
+        || ((agentURL != null) && (agentURL.length() != 0))) {
       buf.append(" (");
-      
+
       if ((agentDesc != null) && (agentDesc.length() != 0)) {
         buf.append(agentDesc);
-        if ( (agentURL != null) || (agentEmail != null) )
+        if ((agentURL != null) || (agentEmail != null))
           buf.append("; ");
       }
-      
+
       if ((agentURL != null) && (agentURL.length() != 0)) {
         buf.append(agentURL);
         if (agentEmail != null)
           buf.append("; ");
       }
-      
+
       if ((agentEmail != null) && (agentEmail.length() != 0))
         buf.append(agentEmail);
-      
+
       buf.append(")");
     }
     return buf.toString();
@@ -354,52 +385,59 @@ public abstract class HttpBase implement
       logger.info("http.accept = " + accept);
     }
   }
-  
-  public byte[] processGzipEncoded(byte[] compressed, URL url) throws IOException {
 
-    if (LOGGER.isTraceEnabled()) { LOGGER.trace("uncompressing...."); }
+  public byte[] processGzipEncoded(byte[] compressed, URL url)
+      throws IOException {
+
+    if (LOGGER.isTraceEnabled()) {
+      LOGGER.trace("uncompressing....");
+    }
 
     // content can be empty (i.e. redirection) in which case
     // there is nothing to unzip
     if (compressed.length == 0)
       return compressed;
-    
+
     byte[] content;
     if (getMaxContent() >= 0) {
-        content = GZIPUtils.unzipBestEffort(compressed, getMaxContent());
+      content = GZIPUtils.unzipBestEffort(compressed, getMaxContent());
     } else {
-        content = GZIPUtils.unzipBestEffort(compressed);
-    } 
+      content = GZIPUtils.unzipBestEffort(compressed);
+    }
 
     if (content == null)
       throw new IOException("unzipBestEffort returned null");
 
     if (LOGGER.isTraceEnabled()) {
       LOGGER.trace("fetched " + compressed.length
-                 + " bytes of compressed content (expanded to "
-                 + content.length + " bytes) from " + url);
+          + " bytes of compressed content (expanded to " + content.length
+          + " bytes) from " + url);
     }
     return content;
   }
 
-  public byte[] processDeflateEncoded(byte[] compressed, URL url) throws IOException {
+  public byte[] processDeflateEncoded(byte[] compressed, URL url)
+      throws IOException {
 
     // content can be empty (i.e. redirection) in which case
     // there is nothing to deflate
     if (compressed.length == 0)
       return compressed;
-    
-    if (LOGGER.isTraceEnabled()) { LOGGER.trace("inflating...."); }
 
-    byte[] content = DeflateUtils.inflateBestEffort(compressed, getMaxContent());
+    if (LOGGER.isTraceEnabled()) {
+      LOGGER.trace("inflating....");
+    }
+
+    byte[] content = DeflateUtils
+        .inflateBestEffort(compressed, getMaxContent());
 
     if (content == null)
       throw new IOException("inflateBestEffort returned null");
 
     if (LOGGER.isTraceEnabled()) {
       LOGGER.trace("fetched " + compressed.length
-                 + " bytes of compressed content (expanded to "
-                 + content.length + " bytes) from " + url);
+          + " bytes of compressed content (expanded to " + content.length
+          + " bytes) from " + url);
     }
     return content;
   }
@@ -407,14 +445,14 @@ public abstract class HttpBase implement
   protected static void main(HttpBase http, String[] args) throws Exception {
     boolean verbose = false;
     String url = null;
-    
+
     String usage = "Usage: Http [-verbose] [-timeout N] url";
-    
+
     if (args.length == 0) {
       System.err.println(usage);
       System.exit(-1);
     }
-    
+
     for (int i = 0; i < args.length; i++) { // parse command line
       if (args[i].equals("-timeout")) { // found -timeout option
         http.timeout = Integer.parseInt(args[++i]) * 1000;
@@ -423,35 +461,34 @@ public abstract class HttpBase implement
       } else if (i != args.length - 1) {
         System.err.println(usage);
         System.exit(-1);
-      } else // root is required parameter
+      } else
+        // root is required parameter
         url = args[i];
     }
-    
-//    if (verbose) {
-//      LOGGER.setLevel(Level.FINE);
-//    }
-    
-    ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum());
+
+    // if (verbose) {
+    // LOGGER.setLevel(Level.FINE);
+    // }
+
+    ProtocolOutput out = http
+        .getProtocolOutput(new Text(url), new CrawlDatum());
     Content content = out.getContent();
-    
+
     System.out.println("Status: " + out.getStatus());
     if (content != null) {
       System.out.println("Content Type: " + content.getContentType());
-      System.out.println("Content Length: " +
-                         content.getMetadata().get(Response.CONTENT_LENGTH));
+      System.out.println("Content Length: "
+          + content.getMetadata().get(Response.CONTENT_LENGTH));
       System.out.println("Content:");
       String text = new String(content.getContent());
       System.out.println(text);
-    }  
+    }
   }
-  
-  protected abstract Response getResponse(URL url,
-                                          CrawlDatum datum,
-                                          boolean followRedirects)
-    throws ProtocolException, IOException;
+
+  protected abstract Response getResponse(URL url, CrawlDatum datum,
+      boolean followRedirects) throws ProtocolException, IOException;
 
   public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
     return robots.getRobotRulesSet(this, url);
   }
 }
-


Modified: 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
 Thu Jan 29 05:38:59 2015
@@ -19,7 +19,6 @@ package org.apache.nutch.protocol.http.a
 // Nutch imports
 import org.apache.nutch.protocol.ProtocolException;
 
-
 public class HttpException extends ProtocolException {
 
   public HttpException() {

Modified: 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
 Thu Jan 29 05:38:59 2015
@@ -32,36 +32,41 @@ import crawlercommons.robots.BaseRobotRu
 import crawlercommons.robots.SimpleRobotRules;
 
 /**
- * This class is used for parsing robots for urls belonging to HTTP protocol.
- * It extends the generic {@link RobotRulesParser} class and contains 
- * Http protocol specific implementation for obtaining the robots file.
+ * This class is used for parsing robots for urls belonging to HTTP protocol. It
+ * extends the generic {@link RobotRulesParser} class and contains Http protocol
+ * specific implementation for obtaining the robots file.
  */
 public class HttpRobotRulesParser extends RobotRulesParser {
-  
-  public static final Logger LOG = LoggerFactory.getLogger(HttpRobotRulesParser.class);
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(HttpRobotRulesParser.class);
   protected boolean allowForbidden = false;
 
-  HttpRobotRulesParser() { }
+  HttpRobotRulesParser() {
+  }
 
   public HttpRobotRulesParser(Configuration conf) {
-           setConf(conf);
+    setConf(conf);
   }
- 
+
   public void setConf(Configuration conf) {
-           super.setConf(conf);
-           allowForbidden = conf.getBoolean("http.robots.403.allow", true);
+    super.setConf(conf);
+    allowForbidden = conf.getBoolean("http.robots.403.allow", true);
   }
 
   /** Compose unique key to store and access robot rules in cache for given URL */
   protected static String getCacheKey(URL url) {
-    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case
-    String host = url.getHost().toLowerCase();          // normalize to lower case
+    String protocol = url.getProtocol().toLowerCase(); // normalize to lower
+                                                       // case
+    String host = url.getHost().toLowerCase(); // normalize to lower case
     int port = url.getPort();
     if (port == -1) {
       port = url.getDefaultPort();
     }
-   /* Robot rules apply only to host, protocol, and port where robots.txt is
-    * hosted (cf. NUTCH-1752). Consequently  */
+    /*
+     * Robot rules apply only to host, protocol, and port where robots.txt is
+     * hosted (cf. NUTCH-1752). Consequently
+     */
     String cacheKey = protocol + ":" + host + ":" + port;
     return cacheKey;
   }
@@ -77,7 +82,7 @@ public class HttpRobotRulesParser extend
    *          The {@link Protocol} object
    * @param url
    *          URL robots.txt applies to
-   *
+   * 
    * @return {@link BaseRobotRules} holding the rules from robots.txt
    */
   public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
@@ -86,13 +91,15 @@ public class HttpRobotRulesParser extend
     BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
 
     boolean cacheRule = true;
-    
-    if (robotRules == null) {                     // cache miss
+
+    if (robotRules == null) { // cache miss
       URL redir = null;
-      if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("cache miss " + url);
+      }
       try {
-        Response response = ((HttpBase)http).getResponse(new URL(url, "/robots.txt"),
-                                             new CrawlDatum(), true);
+        Response response = ((HttpBase) http).getResponse(new URL(url,
+            "/robots.txt"), new CrawlDatum(), true);
         // try one level of redirection ?
         if (response.getCode() == 301 || response.getCode() == 302) {
           String redirection = response.getHeader("Location");
@@ -107,23 +114,23 @@ public class HttpRobotRulesParser extend
             } else {
               redir = new URL(redirection);
             }
-            
-            response = ((HttpBase)http).getResponse(redir, new CrawlDatum(), true);
+
+            response = ((HttpBase) http).getResponse(redir, new CrawlDatum(),
+                true);
           }
         }
 
-        if (response.getCode() == 200)               // found rules: parse them
-          robotRules =  parseRules(url.toString(), response.getContent(), 
-                                   response.getHeader("Content-Type"), 
-                                   agentNames);
+        if (response.getCode() == 200) // found rules: parse them
+          robotRules = parseRules(url.toString(), response.getContent(),
+              response.getHeader("Content-Type"), agentNames);
 
-        else if ( (response.getCode() == 403) && (!allowForbidden) )
-          robotRules = FORBID_ALL_RULES;            // use forbid all
+        else if ((response.getCode() == 403) && (!allowForbidden))
+          robotRules = FORBID_ALL_RULES; // use forbid all
         else if (response.getCode() >= 500) {
           cacheRule = false;
           robotRules = EMPTY_RULES;
-        }else                                        
-          robotRules = EMPTY_RULES;                 // use default rules
+        } else
+          robotRules = EMPTY_RULES; // use default rules
       } catch (Throwable t) {
         if (LOG.isInfoEnabled()) {
           LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
@@ -133,7 +140,7 @@ public class HttpRobotRulesParser extend
       }
 
       if (cacheRule) {
-        CACHE.put(cacheKey, robotRules);  // cache rules for host
+        CACHE.put(cacheKey, robotRules); // cache rules for host
         if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
           // cache also for the redirected host
           CACHE.put(getCacheKey(redir), robotRules);

Modified: 
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
 Thu Jan 29 05:38:59 2015
@@ -23,10 +23,10 @@ import org.junit.Test;
 import crawlercommons.robots.BaseRobotRules;
 
 /**
- * JUnit test case which tests
- * 1. that robots filtering is performed correctly as per the agent name
- * 2. that crawl delay is extracted correctly from the robots file
- *
+ * JUnit test case which tests 1. that robots filtering is performed correctly
+ * as per the agent name 2. that crawl delay is extracted correctly from the
+ * robots file
+ * 
  */
 public class TestRobotRulesParser {
 
@@ -36,39 +36,32 @@ public class TestRobotRulesParser {
   private static final String UNKNOWN_AGENT = "AgentABC";
   private static final String CR = "\r";
 
-  private static final String ROBOTS_STRING = 
-      "User-Agent: Agent1 #foo" + CR 
-      + "Disallow: /a" + CR 
-      + "Disallow: /b/a" + CR 
-      + "#Disallow: /c" + CR 
-      + "Crawl-delay: 10" + CR  // set crawl delay for Agent1 as 10 sec
-      + "" + CR 
-      + "" + CR 
-      + "User-Agent: Agent2" + CR 
-      + "Disallow: /a/bloh" + CR 
-      + "Disallow: /c" + CR
-      + "Disallow: /foo" + CR
-      + "Crawl-delay: 20" + CR
-      + "" + CR 
-      + "User-Agent: *" + CR 
-      + "Disallow: /foo/bar/" + CR;   // no crawl delay for other agents
+  private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR
+      + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c"
+      + CR
+      + "Crawl-delay: 10"
+      + CR // set crawl delay for Agent1 as 10 sec
+      + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh"
+      + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20"
+      + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no
+                                                                          // crawl
+                                                                          // delay
+                                                                          // for
+                                                                          // other
+                                                                          // agents
 
   private static final String[] TEST_PATHS = new String[] {
-    "http://example.com/a",
-    "http://example.com/a/bloh/foo.html",
-    "http://example.com/b",
-    "http://example.com/c",
-    "http://example.com/b/a/index.html",
-    "http://example.com/foo/bar/baz.html"
-  };
-
-  private static final boolean[] RESULTS = new boolean[] {
-    false,  //  /a
-    false,  //  /a/bloh/foo.html
-    true,   //  /b
-    true,   //  /c
-    false,  //  /b/a/index.html
-    true    //  /foo/bar/baz.html
+      "http://example.com/a", "http://example.com/a/bloh/foo.html",
+      "http://example.com/b", "http://example.com/c",
+      "http://example.com/b/a/index.html",
+      "http://example.com/foo/bar/baz.html" };
+
+  private static final boolean[] RESULTS = new boolean[] { false, // /a
+      false, // /a/bloh/foo.html
+      true, // /b
+      true, // /c
+      false, // /b/a/index.html
+      true // /foo/bar/baz.html
   };
 
   private HttpRobotRulesParser parser;
@@ -79,41 +72,52 @@ public class TestRobotRulesParser {
   }
 
   /**
-   * Test that the robots rules are interpreted correctly by the robots rules parser. 
+   * Test that the robots rules are interpreted correctly by the robots rules
+   * parser.
    */
   @Test
   public void testRobotsAgent() {
-    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, SINGLE_AGENT);
 
-    for(int counter = 0; counter < TEST_PATHS.length; counter++) {
-      Assert.assertTrue("testing on agent (" + SINGLE_AGENT + "), and " 
-          + "path " + TEST_PATHS[counter] 
-              + " got " + rules.isAllowed(TEST_PATHS[counter]),
-              rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+      Assert.assertTrue(
+          "testing on agent (" + SINGLE_AGENT + "), and " + "path "
+              + TEST_PATHS[counter] + " got "
+              + rules.isAllowed(TEST_PATHS[counter]),
+          rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
     }
 
-    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, MULTIPLE_AGENTS);
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, MULTIPLE_AGENTS);
 
-    for(int counter = 0; counter < TEST_PATHS.length; counter++) {
-      Assert.assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and " 
-          + "path " + TEST_PATHS[counter] 
-              + " got " + rules.isAllowed(TEST_PATHS[counter]),
-              rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+      Assert.assertTrue(
+          "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
+              + TEST_PATHS[counter] + " got "
+              + rules.isAllowed(TEST_PATHS[counter]),
+          rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
     }
   }
 
   /**
-   * Test that the crawl delay is extracted from the robots file for respective agent. 
-   * If its not specified for a given agent, default value must be returned.
+   * Test that the crawl delay is extracted from the robots file for respective
+   * agent. If its not specified for a given agent, default value must be
+   * returned.
    */
   @Test
   public void testCrawlDelay() {
-    // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be returned by the parser
-    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);
-    Assert.assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ", (rules.getCrawlDelay() == 10000));
+    // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
+    // returned by the parser
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, SINGLE_AGENT);
+    Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ",
+        (rules.getCrawlDelay() == 10000));
 
     // for UNKNOWN_AGENT, the default crawl delay must be returned.
-    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, UNKNOWN_AGENT);
-    Assert.assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ", (rules.getCrawlDelay() == Long.MIN_VALUE));
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, UNKNOWN_AGENT);
+    Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
+        (rules.getCrawlDelay() == Long.MIN_VALUE));
   }
 }

Modified: 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
 Thu Jan 29 05:38:59 2015
@@ -16,11 +16,9 @@
  */
 package org.apache.nutch.urlfilter.api;
 
-
-
 /**
  * A generic regular expression rule.
- *
+ * 
  * @author J&eacute;r&ocirc;me Charron
  */
 public abstract class RegexRule {
@@ -29,13 +27,15 @@ public abstract class RegexRule {
 
   /**
    * Constructs a new regular expression rule.
-   *
-   * @param sign specifies if this rule must filter-in or filter-out.
-   *        A <code>true</code> value means that any url matching this rule
-   *        must be accepted, a <code>false</code> value means that any url
-   *        matching this rule must be rejected.
-   * @param regex is the regular expression used for matching (see
-   *        {@link #match(String)} method).
+   * 
+   * @param sign
+   *          specifies if this rule must filter-in or filter-out. A
+   *          <code>true</code> value means that any url matching this rule must
+   *          be accepted, a <code>false</code> value means that any url
+   *          matching this rule must be rejected.
+   * @param regex
+   *          is the regular expression used for matching (see
+   *          {@link #match(String)} method).
    */
   protected RegexRule(boolean sign, String regex) {
     this.sign = sign;
@@ -43,19 +43,22 @@ public abstract class RegexRule {
 
   /**
    * Return if this rule is used for filtering-in or out.
-   *
+   * 
    * @return <code>true</code> if any url matching this rule must be accepted,
    *         otherwise <code>false</code>.
    */
-  protected boolean accept() { return sign; }
-  
+  protected boolean accept() {
+    return sign;
+  }
+
   /**
    * Checks if a url matches this rule.
-   * @param url is the url to check.
-   * @return <code>true</code> if the specified url matches this rule,
-   *         otherwise <code>false</code>.
+   * 
+   * @param url
+   *          is the url to check.
+   * @return <code>true</code> if the specified url matches this rule, otherwise
+   *         <code>false</code>.
    */
   protected abstract boolean match(String url);
 
 }
-

Modified: 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
 Thu Jan 29 05:38:59 2015
@@ -37,28 +37,32 @@ import org.apache.hadoop.conf.Configurat
 // Nutch imports
 import org.apache.nutch.net.*;
 
-
 /**
- * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on
- * regular expressions.
- *
- * <p>The regular expressions rules are expressed in a file. The file of rules
- * is determined for each implementation using the
- * {@link #getRulesReader(Configuration conf)} method.</p>
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular
+ * expressions.
+ * 
+ * <p>
+ * The regular expressions rules are expressed in a file. The file of rules is
+ * determined for each implementation using the
+ * {@link #getRulesReader(Configuration conf)} method.
+ * </p>
  * 
- * <p>The format of this file is made of many rules (one per line):<br/>
+ * <p>
+ * The format of this file is made of many rules (one per line):<br/>
  * <code>
  * [+-]&lt;regex&gt;
  * </code><br/>
- * where plus (<code>+</code>)means go ahead and index it and minus 
- * (<code>-</code>)means no.</p>
- *
+ * where plus (<code>+</code>)means go ahead and index it and minus (
+ * <code>-</code>)means no.
+ * </p>
+ * 
  * @author J&eacute;r&ocirc;me Charron
  */
 public abstract class RegexURLFilterBase implements URLFilter {
 
   /** My logger */
-  private final static Logger LOG = LoggerFactory.getLogger(RegexURLFilterBase.class);
+  private final static Logger LOG = LoggerFactory
+      .getLogger(RegexURLFilterBase.class);
 
   /** An array of applicable rules */
   private List<RegexRule> rules;
@@ -66,24 +70,28 @@ public abstract class RegexURLFilterBase
   /** The current configuration */
   private Configuration conf;
 
-
   /**
    * Constructs a new empty RegexURLFilterBase
    */
-  public RegexURLFilterBase() { }
+  public RegexURLFilterBase() {
+  }
 
   /**
    * Constructs a new RegexURLFilter and init it with a file of rules.
-   * @param filename is the name of rules file.
+   * 
+   * @param filename
+   *          is the name of rules file.
    */
-  public RegexURLFilterBase(File filename)
-    throws IOException, IllegalArgumentException {
+  public RegexURLFilterBase(File filename) throws IOException,
+      IllegalArgumentException {
     this(new FileReader(filename));
   }
-  
+
   /**
    * Constructs a new RegexURLFilter and inits it with a list of rules.
-   * @param rules string with a list of rules, one rule per line
+   * 
+   * @param rules
+   *          string with a list of rules, one rule per line
    * @throws IOException
    * @throws IllegalArgumentException
    */
@@ -94,68 +102,82 @@ public abstract class RegexURLFilterBase
 
   /**
    * Constructs a new RegexURLFilter and init it with a Reader of rules.
-   * @param reader is a reader of rules.
+   * 
+   * @param reader
+   *          is a reader of rules.
    */
-  protected RegexURLFilterBase(Reader reader)
-    throws IOException, IllegalArgumentException {
+  protected RegexURLFilterBase(Reader reader) throws IOException,
+      IllegalArgumentException {
     rules = readRules(reader);
   }
-  
+
   /**
    * Creates a new {@link RegexRule}.
-   * @param sign of the regular expression.
-   *        A <code>true</code> value means that any URL matching this rule
-   *        must be included, whereas a <code>false</code>
-   *        value means that any URL matching this rule must be excluded.
-   * @param regex is the regular expression associated to this rule.
+   * 
+   * @param sign
+   *          of the regular expression. A <code>true</code> value means that
+   *          any URL matching this rule must be included, whereas a
+   *          <code>false</code> value means that any URL matching this rule
+   *          must be excluded.
+   * @param regex
+   *          is the regular expression associated to this rule.
    */
   protected abstract RegexRule createRule(boolean sign, String regex);
-  
+
   /**
-   * Returns the name of the file of rules to use for
-   * a particular implementation.
-   * @param conf is the current configuration.
+   * Returns the name of the file of rules to use for a particular
+   * implementation.
+   * 
+   * @param conf
+   *          is the current configuration.
    * @return the name of the resource containing the rules to use.
    */
-  protected abstract Reader getRulesReader(Configuration conf) throws IOException;
-  
-  
-  /* -------------------------- *
-   * <implementation:URLFilter> *
-   * -------------------------- */
-  
+  protected abstract Reader getRulesReader(Configuration conf)
+      throws IOException;
+
+  /*
+   * -------------------------- * <implementation:URLFilter> *
+   * --------------------------
+   */
+
   // Inherited Javadoc
   public String filter(String url) {
     for (RegexRule rule : rules) {
       if (rule.match(url)) {
         return rule.accept() ? url : null;
       }
-    };
+    }
+    ;
     return null;
   }
 
-  /* --------------------------- *
-   * </implementation:URLFilter> *
-   * --------------------------- */
-  
-  
-  /* ----------------------------- *
-   * <implementation:Configurable> *
-   * ----------------------------- */
-  
+  /*
+   * --------------------------- * </implementation:URLFilter> *
+   * ---------------------------
+   */
+
+  /*
+   * ----------------------------- * <implementation:Configurable> *
+   * -----------------------------
+   */
+
   public void setConf(Configuration conf) {
     this.conf = conf;
     Reader reader = null;
     try {
       reader = getRulesReader(conf);
     } catch (Exception e) {
-      if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
-      throw new RuntimeException(e.getMessage(), e);      
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.getMessage());
+      }
+      throw new RuntimeException(e.getMessage(), e);
     }
     try {
       rules = readRules(reader);
     } catch (IOException e) {
-      if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.getMessage());
+      }
       throw new RuntimeException(e.getMessage(), e);
     }
   }
@@ -163,45 +185,51 @@ public abstract class RegexURLFilterBase
   public Configuration getConf() {
     return this.conf;
   }
-  
-  /* ------------------------------ *
-   * </implementation:Configurable> *
-   * ------------------------------ */
-  
+
+  /*
+   * ------------------------------ * </implementation:Configurable> *
+   * ------------------------------
+   */
 
   /**
    * Read the specified file of rules.
-   * @param reader is a reader of regular expressions rules.
+   * 
+   * @param reader
+   *          is a reader of regular expressions rules.
    * @return the corresponding {@RegexRule rules}.
    */
-  private List<RegexRule> readRules(Reader reader)
-    throws IOException, IllegalArgumentException {
+  private List<RegexRule> readRules(Reader reader) throws IOException,
+      IllegalArgumentException {
 
     BufferedReader in = new BufferedReader(reader);
     List<RegexRule> rules = new ArrayList<RegexRule>();
     String line;
-       
-    while((line=in.readLine())!=null) {
+
+    while ((line = in.readLine()) != null) {
       if (line.length() == 0) {
         continue;
       }
-      char first=line.charAt(0);
-      boolean sign=false;
+      char first = line.charAt(0);
+      boolean sign = false;
       switch (first) {
-      case '+' : 
-        sign=true;
+      case '+':
+        sign = true;
         break;
-      case '-' :
-        sign=false;
+      case '-':
+        sign = false;
         break;
-      case ' ' : case '\n' : case '#' :           // skip blank & comment lines
+      case ' ':
+      case '\n':
+      case '#': // skip blank & comment lines
         continue;
-      default :
-        throw new IOException("Invalid first character: "+line);
+      default:
+        throw new IOException("Invalid first character: " + line);
       }
 
       String regex = line.substring(1);
-      if (LOG.isTraceEnabled()) { LOG.trace("Adding rule [" + regex + "]"); }
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Adding rule [" + regex + "]");
+      }
       RegexRule rule = createRule(sign, regex);
       rules.add(rule);
     }
@@ -210,18 +238,20 @@ public abstract class RegexURLFilterBase
 
   /**
    * Filter the standard input using a RegexURLFilterBase.
-   * @param filter is the RegexURLFilterBase to use for filtering the
-   *        standard input.
-   * @param args some optional parameters (not used).
+   * 
+   * @param filter
+   *          is the RegexURLFilterBase to use for filtering the standard input.
+   * @param args
+   *          some optional parameters (not used).
    */
   public static void main(RegexURLFilterBase filter, String args[])
-    throws IOException, IllegalArgumentException {
+      throws IOException, IllegalArgumentException {
 
     BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
     String line;
-    while((line=in.readLine())!=null) {
+    while ((line = in.readLine()) != null) {
       String out = filter.filter(line);
-      if (out!=null) {
+      if (out != null) {
         System.out.print("+");
         System.out.println(out);
       } else {

Modified: 
nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
 Thu Jan 29 05:38:59 2015
@@ -31,26 +31,25 @@ import org.slf4j.LoggerFactory;
 // Nutch imports
 import org.apache.nutch.net.URLFilter;
 
-
 /**
  * JUnit based test of class <code>RegexURLFilterBase</code>.
- *
+ * 
  * @author J&eacute;r&ocirc;me Charron
  */
 public abstract class RegexURLFilterBaseTest {
 
   /** My logger */
-  protected static final Logger LOG = LoggerFactory.getLogger(RegexURLFilterBaseTest.class);  
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(RegexURLFilterBaseTest.class);
 
-  private final static String SEPARATOR = System.getProperty("file.separator");  
+  private final static String SEPARATOR = System.getProperty("file.separator");
   private final static String SAMPLES = System.getProperty("test.data", ".");
 
   protected abstract URLFilter getURLFilter(Reader rules);
 
   protected void bench(int loops, String file) {
     try {
-      bench(loops,
-          new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+      bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
           new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
     } catch (Exception e) {
       Assert.fail(e.toString());
@@ -62,14 +61,14 @@ public abstract class RegexURLFilterBase
     try {
       URLFilter filter = getURLFilter(rules);
       FilteredURL[] expected = readURLFile(urls);
-      for (int i=0; i<loops; i++) {
+      for (int i = 0; i < loops; i++) {
         test(filter, expected);
       }
     } catch (Exception e) {
       Assert.fail(e.toString());
     }
-    LOG.info("bench time (" + loops + ") " +
-        (System.currentTimeMillis()-start) + "ms");
+    LOG.info("bench time (" + loops + ") "
+        + (System.currentTimeMillis() - start) + "ms");
   }
 
   protected void test(String file) {
@@ -90,7 +89,7 @@ public abstract class RegexURLFilterBase
   }
 
   protected void test(URLFilter filter, FilteredURL[] expected) {
-    for (int i=0; i<expected.length; i++) {
+    for (int i = 0; i < expected.length; i++) {
       String result = filter.filter(expected[i].url);
       if (result != null) {
         Assert.assertTrue(expected[i].url, expected[i].sign);
@@ -104,7 +103,7 @@ public abstract class RegexURLFilterBase
     BufferedReader in = new BufferedReader(reader);
     List<FilteredURL> list = new ArrayList<FilteredURL>();
     String line;
-    while((line=in.readLine()) != null) {
+    while ((line = in.readLine()) != null) {
       if (line.length() != 0) {
         list.add(new FilteredURL(line));
       }
@@ -119,13 +118,13 @@ public abstract class RegexURLFilterBase
 
     FilteredURL(String line) {
       switch (line.charAt(0)) {
-      case '+' : 
+      case '+':
         sign = true;
         break;
-      case '-' :
+      case '-':
         sign = false;
         break;
-      default :
+      default:
         // Simply ignore...
       }
       url = line.substring(1);

Modified: 
nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
 Thu Jan 29 05:38:59 2015
@@ -16,7 +16,6 @@
  */
 package org.apache.nutch.microformats.reltag;
 
-
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
@@ -29,29 +28,27 @@ import org.apache.nutch.parse.Parse;
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
 
-
 /**
- * An {@link org.apache.nutch.indexer.IndexingFilter} that 
- * add <code>tag</code> field(s) to the document.
- *
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that add <code>tag</code>
+ * field(s) to the document.
+ * 
  * @see <a href="http://www.microformats.org/wiki/rel-tag">
  *      http://www.microformats.org/wiki/rel-tag</a>
  * @author J&eacute;r&ocirc;me Charron
  */
 public class RelTagIndexingFilter implements IndexingFilter {
-  
 
   private Configuration conf;
 
-
   // Inherited JavaDoc
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
-    throws IndexingException {
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
 
     // Check if some Rel-Tags found, possibly put there by RelTagParser
-    String[] tags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG);
+    String[] tags = parse.getData().getParseMeta()
+        .getValues(RelTagParser.REL_TAG);
     if (tags != null) {
-      for (int i=0; i<tags.length; i++) {
+      for (int i = 0; i < tags.length; i++) {
         doc.add("tag", tags[i]);
       }
     }
@@ -59,10 +56,11 @@ public class RelTagIndexingFilter implem
     return doc;
   }
 
-  /* ----------------------------- *
-   * <implementation:Configurable> *
-   * ----------------------------- */
-  
+  /*
+   * ----------------------------- * <implementation:Configurable> *
+   * -----------------------------
+   */
+
   public void setConf(Configuration conf) {
     this.conf = conf;
   }
@@ -70,9 +68,10 @@ public class RelTagIndexingFilter implem
   public Configuration getConf() {
     return this.conf;
   }
-  
-  /* ------------------------------ *
-   * </implementation:Configurable> *
-   * ------------------------------ */
-  
+
+  /*
+   * ------------------------------ * </implementation:Configurable> *
+   * ------------------------------
+   */
+
 }

Modified: 
nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
 (original)
+++ 
nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
 Thu Jan 29 05:38:59 2015
@@ -45,24 +45,24 @@ import org.apache.hadoop.conf.Configurat
 
 /**
  * Adds microformat rel-tags of document if found.
- *
+ * 
  * @see <a href="http://www.microformats.org/wiki/rel-tag">
  *      http://www.microformats.org/wiki/rel-tag</a>
  */
 public class RelTagParser implements HtmlParseFilter {
-  
+
   public final static Logger LOG = LoggerFactory.getLogger(RelTagParser.class);
 
   public final static String REL_TAG = "Rel-Tag";
-  
+
   private Configuration conf = null;
-  
+
   /**
    * Scan the HTML document looking at possible rel-tags
    */
   public ParseResult filter(Content content, ParseResult parseResult,
-    HTMLMetaTags metaTags, DocumentFragment doc) {
-    
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
     // get parse obj
     Parse parse = parseResult.get(content.getUrl());
     // Trying to find the document's rel-tags
@@ -79,16 +79,16 @@ public class RelTagParser implements Htm
   private static class Parser {
 
     Set<String> tags = null;
-    
+
     Parser(Node node) {
       tags = new TreeSet<String>();
       parse(node);
     }
-  
+
     Set<String> getRelTags() {
       return tags;
     }
-    
+
     void parse(Node node) {
 
       if (node.getNodeType() == Node.ELEMENT_NODE) {
@@ -105,7 +105,7 @@ public class RelTagParser implements Htm
               if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
                 String tag = parseTag(hrefNode.getNodeValue());
                 if (!StringUtil.isEmpty(tag)) {
-                  if(!tags.contains(tag)){
+                  if (!tags.contains(tag)) {
                     tags.add(tag);
                     LOG.debug("Adding tag: " + tag + " to tag set.");
                   }
@@ -115,26 +115,27 @@ public class RelTagParser implements Htm
           }
         }
       }
-      
+
       // Recurse
       NodeList children = node.getChildNodes();
-      for (int i=0; children != null && i<children.getLength(); i++)
+      for (int i = 0; children != null && i < children.getLength(); i++)
         parse(children.item(i));
     }
-    
+
     private final static String parseTag(String url) {
       String tag = null;
       try {
         URL u = new URL(url);
         String path = u.getPath();
-        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8");
+        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1),
+            "UTF-8");
       } catch (Exception e) {
         // Malformed tag...
         tag = null;
       }
       return tag;
     }
-    
+
   }
 
   public void setConf(Configuration conf) {

Modified: 
nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
 Thu Jan 29 05:38:59 2015
@@ -50,18 +50,21 @@ import java.nio.charset.Charset;
 
 public class ExtParser implements Parser {
 
-  public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.ext");
+  public static final Logger LOG = LoggerFactory
+      .getLogger("org.apache.nutch.parse.ext");
 
   static final int BUFFER_SIZE = 4096;
 
   static final int TIMEOUT_DEFAULT = 30; // in seconds
 
-  // handy map from String contentType to String[] {command, timeoutString, encoding}
+  // handy map from String contentType to String[] {command, timeoutString,
+  // encoding}
   Hashtable<String, String[]> TYPE_PARAMS_MAP = new Hashtable<String, String[]>();
 
-  private Configuration conf;  
+  private Configuration conf;
 
-  public ExtParser () { }
+  public ExtParser() {
+  }
 
   public ParseResult getParse(Content content) {
 
@@ -70,14 +73,15 @@ public class ExtParser implements Parser
     String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
     if (params == null)
       return new ParseStatus(ParseStatus.FAILED,
-                      "No external command defined for contentType: " + contentType).getEmptyParseResult(content.getUrl(), getConf());
+          "No external command defined for contentType: " + contentType)
+          .getEmptyParseResult(content.getUrl(), getConf());
 
     String command = params[0];
     int timeout = Integer.parseInt(params[1]);
     String encoding = params[2];
 
     if (LOG.isTraceEnabled()) {
-      LOG.trace("Use "+command+ " with timeout="+timeout+"secs");
+      LOG.trace("Use " + command + " with timeout=" + timeout + "secs");
     }
 
     String text = null;
@@ -89,19 +93,19 @@ public class ExtParser implements Parser
 
       String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
       if (contentLength != null
-            && raw.length != Integer.parseInt(contentLength)) {
-          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
-                "Content truncated at " + raw.length
-            +" bytes. Parser can't handle incomplete "
-            + contentType + " file.").getEmptyParseResult(content.getUrl(), getConf());
+          && raw.length != Integer.parseInt(contentLength)) {
+        return new ParseStatus(ParseStatus.FAILED,
+            ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
+                + " bytes. Parser can't handle incomplete " + contentType
+                + " file.").getEmptyParseResult(content.getUrl(), getConf());
       }
 
       ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
-      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE/4);
+      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4);
 
       CommandRunner cr = new CommandRunner();
 
-      cr.setCommand(command+ " " +contentType);
+      cr.setCommand(command + " " + contentType);
       cr.setInputStream(new ByteArrayInputStream(raw));
       cr.setStdOutputStream(os);
       cr.setStdErrorStream(es);
@@ -111,14 +115,15 @@ public class ExtParser implements Parser
       cr.evaluate();
 
       if (cr.getExitValue() != 0)
-        return new ParseStatus(ParseStatus.FAILED,
-                        "External command " + command
-                        + " failed with error: " + es.toString()).getEmptyParseResult(content.getUrl(), getConf());
+        return new ParseStatus(ParseStatus.FAILED, "External command "
+            + command + " failed with error: " + es.toString())
+            .getEmptyParseResult(content.getUrl(), getConf());
 
       text = os.toString(encoding);
 
     } catch (Exception e) { // run time exception
-      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
     }
 
     if (text == null)
@@ -131,15 +136,15 @@ public class ExtParser implements Parser
     Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
 
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
-                                        outlinks, content.getMetadata());
-    return ParseResult.createParseResult(content.getUrl(), 
-                                         new ParseImpl(text, parseData));
+        outlinks, content.getMetadata());
+    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
+        parseData));
   }
-  
+
   public void setConf(Configuration conf) {
     this.conf = conf;
-    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
-        "org.apache.nutch.parse.Parser").getExtensions();
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions();
 
     String contentType, command, timeoutString, encoding;
 
@@ -161,13 +166,14 @@ public class ExtParser implements Parser
       // null encoding means default
       encoding = extension.getAttribute("encoding");
       if (encoding == null)
-          encoding = Charset.defaultCharset().name();
+        encoding = Charset.defaultCharset().name();
 
       timeoutString = extension.getAttribute("timeout");
       if (timeoutString == null || timeoutString.equals(""))
         timeoutString = "" + TIMEOUT_DEFAULT;
 
-      TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString, encoding });
+      TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString,
+          encoding });
     }
   }
 

Modified: 
nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java (original)
+++ nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
  * Parse wrapper to run external command to do the parsing.
  */
 package org.apache.nutch.parse.ext;
+

Modified: 
nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original)
+++ nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Thu Jan 29 05:38:59 2015
@@ -37,15 +37,14 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 
-/** 
- * Unit tests for ExtParser.
- * First creates a temp file with fixed content, then fetch
- * and parse it using external command 'cat' and 'md5sum' alternately
- * for 10 times. Doing so also does a light stress test for class
- * CommandRunner.java (as used in ExtParser.java).
- *
+/**
+ * Unit tests for ExtParser. First creates a temp file with fixed content, then
+ * fetch and parse it using external command 'cat' and 'md5sum' alternately for
+ * 10 times. Doing so also does a light stress test for class CommandRunner.java
+ * (as used in ExtParser.java).
+ * 
  * Warning: currently only do test on linux platform.
- *
+ * 
  * @author John Xing
  */
 public class TestExtParser {
@@ -67,10 +66,11 @@ public class TestExtParser {
       File tempDir = new File(path);
       if (!tempDir.exists())
         tempDir.mkdir();
-      tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt",tempDir);
+      tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt",
+          tempDir);
     } else {
       // otherwise in java.io.tmpdir
-      tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt");
+      tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt");
     }
     urlString = tempFile.toURI().toURL().toString();
 
@@ -79,8 +79,10 @@ public class TestExtParser {
     fos.close();
 
     // get nutch content
-    Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
-    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
+    Protocol protocol = new ProtocolFactory(NutchConfiguration.create())
+        .getProtocol(urlString);
+    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
+        .getContent();
     protocol = null;
   }
 
@@ -90,8 +92,8 @@ public class TestExtParser {
     content = null;
 
     // clean temp file
-    //if (tempFile != null && tempFile.exists())
-    //  tempFile.delete();
+    // if (tempFile != null && tempFile.exists())
+    // tempFile.delete();
   }
 
   @Test
@@ -100,24 +102,27 @@ public class TestExtParser {
 
     // now test only on linux platform
     if (!System.getProperty("os.name").equalsIgnoreCase("linux")) {
-      System.err.println("Current OS is "+System.getProperty("os.name")+".");
+      System.err
+          .println("Current OS is " + System.getProperty("os.name") + ".");
       System.err.println("No test is run on OS other than linux.");
       return;
     }
 
     Configuration conf = NutchConfiguration.create();
     // loop alternately, total 10*2 times of invoking external command
-    for (int i=0; i<10; i++) {
+    for (int i = 0; i < 10; i++) {
       // check external parser that does 'cat'
       contentType = "application/vnd.nutch.example.cat";
       content.setContentType(contentType);
-      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl());
-      Assert.assertEquals(expectedText,parse.getText());
+      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
+          content.getUrl());
+      Assert.assertEquals(expectedText, parse.getText());
 
       // check external parser that does 'md5sum'
       contentType = "application/vnd.nutch.example.md5sum";
       content.setContentType(contentType);
-      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl());
+      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
+          content.getUrl());
       Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
     }
   }

svn commit: r1655526 [15/26] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...

Reply via email to