p...

lewismc Wed, 28 Jan 2015 21:40:02 -0800

Modified: 
nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
 Thu Jan 29 05:38:59 2015
@@ -67,52 +67,52 @@ import org.apache.nutch.parse.Parse;
  */
 public class URLMetaIndexingFilter implements IndexingFilter {
 
-       private static final Logger LOG = LoggerFactory
-                       .getLogger(URLMetaIndexingFilter.class);
-       private static final String CONF_PROPERTY = "urlmeta.tags";
-       private static String[] urlMetaTags;
-       private Configuration conf;
-
-       /**
-        * This will take the metatags that you have listed in your "urlmeta.tags"
-        * property, and looks for them inside the CrawlDatum object. If they exist,
-        * this will add it as an attribute inside the NutchDocument.
-        * 
-        * @see IndexingFilter#filter
-        */
-       public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-                       CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-               if (conf != null)
-                       this.setConf(conf);
-
-               if (urlMetaTags == null || doc == null)
-                       return doc;
-
-               for (String metatag : urlMetaTags) {
-                       Text metadata = (Text) datum.getMetaData().get(new Text(metatag));
-
-                       if (metadata != null)
-                               doc.add(metatag, metadata.toString());
-               }
-
-               return doc;
-       }
-
-       /** Boilerplate */
-       public Configuration getConf() {
-               return conf;
-       }
-
-       /**
-        * handles conf assignment and pulls the value assignment from the
-        * "urlmeta.tags" property
-        */
-       public void setConf(Configuration conf) {
-               this.conf = conf;
+  private static final Logger LOG = LoggerFactory
+      .getLogger(URLMetaIndexingFilter.class);
+  private static final String CONF_PROPERTY = "urlmeta.tags";
+  private static String[] urlMetaTags;
+  private Configuration conf;
+
+  /**
+   * This will take the metatags that you have listed in your "urlmeta.tags"
+   * property, and looks for them inside the CrawlDatum object. If they exist,
+   * this will add it as an attribute inside the NutchDocument.
+   * 
+   * @see IndexingFilter#filter
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    if (conf != null)
+      this.setConf(conf);
+
+    if (urlMetaTags == null || doc == null)
+      return doc;
+
+    for (String metatag : urlMetaTags) {
+      Text metadata = (Text) datum.getMetaData().get(new Text(metatag));
+
+      if (metadata != null)
+        doc.add(metatag, metadata.toString());
+    }
+
+    return doc;
+  }
+
+  /** Boilerplate */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * handles conf assignment and pulls the value assignment from the
+   * "urlmeta.tags" property
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
 
-               if (conf == null)
-                       return;
+    if (conf == null)
+      return;
 
-               urlMetaTags = conf.getStrings(CONF_PROPERTY);
-       }
+    urlMetaTags = conf.getStrings(CONF_PROPERTY);
+  }
 }


Modified: 
nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
 Thu Jan 29 05:38:59 2015
@@ -43,7 +43,8 @@ import org.apache.nutch.scoring.ScoringF
  */
 public class URLMetaScoringFilter extends Configured implements ScoringFilter {
 
-  private static final Logger LOG = LoggerFactory.getLogger(URLMetaScoringFilter.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(URLMetaScoringFilter.class);
   private static final String CONF_PROPERTY = "urlmeta.tags";
   private static String[] urlMetaTags;
   private Configuration conf;
@@ -73,8 +74,8 @@ public class URLMetaScoringFilter extend
         if (metaFromParse == null)
           continue;
 
-        nextTarget.getValue().getMetaData().put(new Text(metatag),
-            new Text(metaFromParse));
+        nextTarget.getValue().getMetaData()
+            .put(new Text(metatag), new Text(metaFromParse));
       }
     }
     return adjust;

Modified: 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
 Thu Jan 29 05:38:59 2015
@@ -39,185 +39,180 @@ import org.apache.oro.text.regex.*;
  * </ul>
  */
 public class BasicURLNormalizer extends Configured implements URLNormalizer {
-    public static final Logger LOG = LoggerFactory.getLogger(BasicURLNormalizer.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(BasicURLNormalizer.class);
 
-    private Perl5Compiler compiler = new Perl5Compiler();
-    private ThreadLocal<Perl5Matcher> matchers = new ThreadLocal<Perl5Matcher>() {
-        protected Perl5Matcher initialValue() {
-          return new Perl5Matcher();
-        }
-      };
-    private final Rule relativePathRule;
-    private final Rule leadingRelativePathRule;
-    private final Rule currentPathRule;
-    private final Rule adjacentSlashRule;
-    
-    private final static java.util.regex.Pattern hasNormalizablePattern = java.util.regex.Pattern.compile("/\\.?\\.?/");
-
-    private Configuration conf;
-
-    public BasicURLNormalizer() {
-      try {
-        // this pattern tries to find spots like "/xx/../" in the url, which
-        // could be replaced by "/" xx consists of chars, different then "/"
-        // (slash) and needs to have at least one char different from "."
-        relativePathRule = new Rule();
-        relativePathRule.pattern = (Perl5Pattern)
-          compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)",
-                           Perl5Compiler.READ_ONLY_MASK);
-        relativePathRule.substitution = new Perl5Substitution("/");
-
-        // this pattern tries to find spots like leading "/../" in the url,
-        // which could be replaced by "/"
-        leadingRelativePathRule = new Rule();
-        leadingRelativePathRule.pattern = (Perl5Pattern)
-          compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
-        leadingRelativePathRule.substitution = new Perl5Substitution("/");
-
-        // this pattern tries to find spots like "/./" in the url,
-        // which could be replaced by "/"
-        currentPathRule = new Rule();
-        currentPathRule.pattern = (Perl5Pattern)
-          compiler.compile("(/\\./)", Perl5Compiler.READ_ONLY_MASK);
-        currentPathRule.substitution = new Perl5Substitution("/");
-
-        // this pattern tries to find spots like "xx//yy" in the url,
-        // which could be replaced by a "/"
-        adjacentSlashRule = new Rule();
-        adjacentSlashRule.pattern = (Perl5Pattern)      
-          compiler.compile("/{2,}", Perl5Compiler.READ_ONLY_MASK);     
-        adjacentSlashRule.substitution = new Perl5Substitution("/");
-        
-      } catch (MalformedPatternException e) {
-        throw new RuntimeException(e);
-      }
+  private Perl5Compiler compiler = new Perl5Compiler();
+  private ThreadLocal<Perl5Matcher> matchers = new ThreadLocal<Perl5Matcher>() {
+    protected Perl5Matcher initialValue() {
+      return new Perl5Matcher();
+    }
+  };
+  private final Rule relativePathRule;
+  private final Rule leadingRelativePathRule;
+  private final Rule currentPathRule;
+  private final Rule adjacentSlashRule;
+
+  private final static java.util.regex.Pattern hasNormalizablePattern = java.util.regex.Pattern
+      .compile("/\\.?\\.?/");
+
+  private Configuration conf;
+
+  public BasicURLNormalizer() {
+    try {
+      // this pattern tries to find spots like "/xx/../" in the url, which
+      // could be replaced by "/" xx consists of chars, different then "/"
+      // (slash) and needs to have at least one char different from "."
+      relativePathRule = new Rule();
+      relativePathRule.pattern = (Perl5Pattern) compiler.compile(
+          "(/[^/]*[^/.]{1}[^/]*/\\.\\./)", Perl5Compiler.READ_ONLY_MASK);
+      relativePathRule.substitution = new Perl5Substitution("/");
+
+      // this pattern tries to find spots like leading "/../" in the url,
+      // which could be replaced by "/"
+      leadingRelativePathRule = new Rule();
+      leadingRelativePathRule.pattern = (Perl5Pattern) compiler.compile(
+          "^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
+      leadingRelativePathRule.substitution = new Perl5Substitution("/");
+
+      // this pattern tries to find spots like "/./" in the url,
+      // which could be replaced by "/"
+      currentPathRule = new Rule();
+      currentPathRule.pattern = (Perl5Pattern) compiler.compile("(/\\./)",
+          Perl5Compiler.READ_ONLY_MASK);
+      currentPathRule.substitution = new Perl5Substitution("/");
+
+      // this pattern tries to find spots like "xx//yy" in the url,
+      // which could be replaced by a "/"
+      adjacentSlashRule = new Rule();
+      adjacentSlashRule.pattern = (Perl5Pattern) compiler.compile("/{2,}",
+          Perl5Compiler.READ_ONLY_MASK);
+      adjacentSlashRule.substitution = new Perl5Substitution("/");
+
+    } catch (MalformedPatternException e) {
+      throw new RuntimeException(e);
     }
+  }
 
-    public String normalize(String urlString, String scope)
-            throws MalformedURLException {
-        if ("".equals(urlString))                     // permit empty
-            return urlString;
-
-        urlString = urlString.trim();                 // remove extra spaces
-
-        URL url = new URL(urlString);
-
-        String protocol = url.getProtocol();
-        String host = url.getHost();
-        int port = url.getPort();
-        String file = url.getFile();
-
-        boolean changed = false;
-
-        if (!urlString.startsWith(protocol))        // protocol was lowercased
-            changed = true;
-
-        if ("http".equals(protocol) || "https".equals(protocol) || "ftp".equals(protocol)) {
-
-            if (host != null) {
-                String newHost = host.toLowerCase();    // lowercase host
-                if (!host.equals(newHost)) {
-                    host = newHost;
-                    changed = true;
-                }
-            }
-
-            if (port == url.getDefaultPort()) {       // uses default port
-                port = -1;                              // so don't specify it
-                changed = true;
-            }
-
-            if (file == null || "".equals(file)) {    // add a slash
-                file = "/";
-                changed = true;
-            }
-
-            if (url.getRef() != null) {                 // remove the ref
-                changed = true;
-            }
-
-            // check for unnecessary use of "/../"
-            String file2 = substituteUnnecessaryRelativePaths(file);
-
-            if (!file.equals(file2)) {
-                changed = true;
-                file = file2;
-            }
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException {
+    if ("".equals(urlString)) // permit empty
+      return urlString;
 
-        }
+    urlString = urlString.trim(); // remove extra spaces
 
-        if (changed)
-            urlString = new URL(protocol, host, port, file).toString();
+    URL url = new URL(urlString);
 
-        return urlString;
-    }
+    String protocol = url.getProtocol();
+    String host = url.getHost();
+    int port = url.getPort();
+    String file = url.getFile();
+
+    boolean changed = false;
 
-    private String substituteUnnecessaryRelativePaths(String file) {
-       
-       if (!hasNormalizablePattern.matcher(file).find())
-               return file;
-       
-        String fileWorkCopy = file;
-        int oldLen = file.length();
-        int newLen = oldLen - 1;
-
-        // All substitutions will be done step by step, to ensure that certain
-        // constellations will be normalized, too
-        //
-        // For example: "/aa/bb/../../cc/../foo.html will be normalized in the
-        // following manner:
-        //   "/aa/bb/../../cc/../foo.html"
-        //   "/aa/../cc/../foo.html"
-        //   "/cc/../foo.html"
-        //   "/foo.html"
-        //
-        // The normalization also takes care of leading "/../", which will be
-        // replaced by "/", because this is a rather a sign of bad webserver
-        // configuration than of a wanted link.  For example, urls like
-        // "http://www.foo.com/../" should return a http 404 error instead of
-        // redirecting to "http://www.foo.com".
-        //
-        Perl5Matcher matcher = (Perl5Matcher)matchers.get();
-
-        while (oldLen != newLen) {
-            // substitue first occurence of "/xx/../" by "/"
-            oldLen = fileWorkCopy.length();
-            fileWorkCopy = Util.substitute
-              (matcher, relativePathRule.pattern,
-               relativePathRule.substitution, fileWorkCopy, 1);
-
-            // remove leading "/../"
-            fileWorkCopy = Util.substitute
-              (matcher, leadingRelativePathRule.pattern,
-               leadingRelativePathRule.substitution, fileWorkCopy, 1);
-
-            // remove unnecessary "/./"
-            fileWorkCopy = Util.substitute
-            (matcher, currentPathRule.pattern,
-                       currentPathRule.substitution, fileWorkCopy, 1);
-            
-            
-            // collapse adjacent slashes with "/"
-            fileWorkCopy = Util.substitute
-            (matcher, adjacentSlashRule.pattern,
-              adjacentSlashRule.substitution, fileWorkCopy, 1);
-            
-            newLen = fileWorkCopy.length();
+    if (!urlString.startsWith(protocol)) // protocol was lowercased
+      changed = true;
+
+    if ("http".equals(protocol) || "https".equals(protocol)
+        || "ftp".equals(protocol)) {
+
+      if (host != null) {
+        String newHost = host.toLowerCase(); // lowercase host
+        if (!host.equals(newHost)) {
+          host = newHost;
+          changed = true;
         }
+      }
+
+      if (port == url.getDefaultPort()) { // uses default port
+        port = -1; // so don't specify it
+        changed = true;
+      }
+
+      if (file == null || "".equals(file)) { // add a slash
+        file = "/";
+        changed = true;
+      }
+
+      if (url.getRef() != null) { // remove the ref
+        changed = true;
+      }
+
+      // check for unnecessary use of "/../"
+      String file2 = substituteUnnecessaryRelativePaths(file);
+
+      if (!file.equals(file2)) {
+        changed = true;
+        file = file2;
+      }
 
-        return fileWorkCopy;
     }
 
+    if (changed)
+      urlString = new URL(protocol, host, port, file).toString();
+
+    return urlString;
+  }
+
+  private String substituteUnnecessaryRelativePaths(String file) {
+
+    if (!hasNormalizablePattern.matcher(file).find())
+      return file;
 
-    /**
-     * Class which holds a compiled pattern and its corresponding substition
-     * string.
-     */
-    private static class Rule {
-        public Perl5Pattern pattern;
-        public Perl5Substitution substitution;
+    String fileWorkCopy = file;
+    int oldLen = file.length();
+    int newLen = oldLen - 1;
+
+    // All substitutions will be done step by step, to ensure that certain
+    // constellations will be normalized, too
+    //
+    // For example: "/aa/bb/../../cc/../foo.html will be normalized in the
+    // following manner:
+    // "/aa/bb/../../cc/../foo.html"
+    // "/aa/../cc/../foo.html"
+    // "/cc/../foo.html"
+    // "/foo.html"
+    //
+    // The normalization also takes care of leading "/../", which will be
+    // replaced by "/", because this is a rather a sign of bad webserver
+    // configuration than of a wanted link. For example, urls like
+    // "http://www.foo.com/../" should return a http 404 error instead of
+    // redirecting to "http://www.foo.com".
+    //
+    Perl5Matcher matcher = (Perl5Matcher) matchers.get();
+
+    while (oldLen != newLen) {
+      // substitue first occurence of "/xx/../" by "/"
+      oldLen = fileWorkCopy.length();
+      fileWorkCopy = Util.substitute(matcher, relativePathRule.pattern,
+          relativePathRule.substitution, fileWorkCopy, 1);
+
+      // remove leading "/../"
+      fileWorkCopy = Util.substitute(matcher, leadingRelativePathRule.pattern,
+          leadingRelativePathRule.substitution, fileWorkCopy, 1);
+
+      // remove unnecessary "/./"
+      fileWorkCopy = Util.substitute(matcher, currentPathRule.pattern,
+          currentPathRule.substitution, fileWorkCopy, 1);
+
+      // collapse adjacent slashes with "/"
+      fileWorkCopy = Util.substitute(matcher, adjacentSlashRule.pattern,
+          adjacentSlashRule.substitution, fileWorkCopy, 1);
+
+      newLen = fileWorkCopy.length();
     }
 
+    return fileWorkCopy;
+  }
+
+  /**
+   * Class which holds a compiled pattern and its corresponding substition
+   * string.
+   */
+  private static class Rule {
+    public Perl5Pattern pattern;
+    public Perl5Substitution substitution;
+  }
 
   public void setConf(Configuration conf) {
     this.conf = conf;
@@ -228,4 +223,3 @@ public class BasicURLNormalizer extends
   }
 
 }
-

Modified: 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
 Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * and dot segments in path.
  */
 package org.apache.nutch.net.urlnormalizer.basic;
+

Modified: 
nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
 Thu Jan 29 05:38:59 2015
@@ -26,16 +26,15 @@ import org.junit.Test;
 /** Unit tests for BasicURLNormalizer. */
 public class TestBasicURLNormalizer {
   private BasicURLNormalizer normalizer;
-  
+
   private Configuration conf;
-  
+
   public TestBasicURLNormalizer() {
     normalizer = new BasicURLNormalizer();
     conf = NutchConfiguration.create();
     normalizer.setConf(conf);
   }
 
-
   @Test
   public void testNormalizer() throws Exception {
     // check that leading and trailing spaces are removed
@@ -58,59 +57,49 @@ public class TestBasicURLNormalizer {
     // check that references are removed
     normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html");
 
-    //     // check that encoding is normalized
-    //     normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
+    // // check that encoding is normalized
+    // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
 
     // check that unnecessary "../" are removed
 
-    normalizeTest("http://foo.com/aa/./foo.html",
-                  "http://foo.com/aa/foo.html" );
-    normalizeTest("http://foo.com/aa/../",
-                  "http://foo.com/" );
-    normalizeTest("http://foo.com/aa/bb/../",
-                  "http://foo.com/aa/");
-    normalizeTest("http://foo.com/aa/..",
-                  "http://foo.com/aa/..");
+    normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html");
+    normalizeTest("http://foo.com/aa/../", "http://foo.com/");
+    normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/");
+    normalizeTest("http://foo.com/aa/..", "http://foo.com/aa/..");
     normalizeTest("http://foo.com/aa/bb/cc/../../foo.html",
-                  "http://foo.com/aa/foo.html");
+        "http://foo.com/aa/foo.html");
     normalizeTest("http://foo.com/aa/bb/../cc/dd/../ee/foo.html",
-                  "http://foo.com/aa/cc/ee/foo.html");
-    normalizeTest("http://foo.com/../foo.html",
-                  "http://foo.com/foo.html" );
-    normalizeTest("http://foo.com/../../foo.html",
-                  "http://foo.com/foo.html" );
-    normalizeTest("http://foo.com/../aa/../foo.html",
-                  "http://foo.com/foo.html" );
-    normalizeTest("http://foo.com/aa/../../foo.html",
-                  "http://foo.com/foo.html" );
+        "http://foo.com/aa/cc/ee/foo.html");
+    normalizeTest("http://foo.com/../foo.html", "http://foo.com/foo.html");
+    normalizeTest("http://foo.com/../../foo.html", "http://foo.com/foo.html");
+    normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html");
+    normalizeTest("http://foo.com/aa/../../foo.html", "http://foo.com/foo.html");
     normalizeTest("http://foo.com/aa/../bb/../foo.html/../../",
-                  "http://foo.com/" );
-    normalizeTest("http://foo.com/../aa/foo.html",
-                  "http://foo.com/aa/foo.html" );
-    normalizeTest("http://foo.com/../aa/../foo.html",
-                  "http://foo.com/foo.html" );
+        "http://foo.com/");
+    normalizeTest("http://foo.com/../aa/foo.html", "http://foo.com/aa/foo.html");
+    normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html");
     normalizeTest("http://foo.com/a..a/foo.html",
-                  "http://foo.com/a..a/foo.html" );
-    normalizeTest("http://foo.com/a..a/../foo.html",
-                  "http://foo.com/foo.html" );
+        "http://foo.com/a..a/foo.html");
+    normalizeTest("http://foo.com/a..a/../foo.html", "http://foo.com/foo.html");
     normalizeTest("http://foo.com/foo.foo/../foo.html",
-                  "http://foo.com/foo.html" );
+        "http://foo.com/foo.html");
     normalizeTest("http://foo.com//aa/bb/foo.html",
-                  "http://foo.com/aa/bb/foo.html" );
+        "http://foo.com/aa/bb/foo.html");
     normalizeTest("http://foo.com/aa//bb/foo.html",
-                  "http://foo.com/aa/bb/foo.html" );
+        "http://foo.com/aa/bb/foo.html");
     normalizeTest("http://foo.com/aa/bb//foo.html",
-                  "http://foo.com/aa/bb/foo.html" );
+        "http://foo.com/aa/bb/foo.html");
     normalizeTest("http://foo.com//aa//bb//foo.html",
-                  "http://foo.com/aa/bb/foo.html" );
+        "http://foo.com/aa/bb/foo.html");
     normalizeTest("http://foo.com////aa////bb////foo.html",
-                  "http://foo.com/aa/bb/foo.html" );
+        "http://foo.com/aa/bb/foo.html");
   }
 
   private void normalizeTest(String weird, String normal) throws Exception {
-    Assert.assertEquals(normal, normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));
+    Assert.assertEquals(normal,
+        normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));
   }
-  
+
   public static void main(String[] args) throws Exception {
     new TestBasicURLNormalizer().testNormalizer();
   }

Modified: 
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
 Thu Jan 29 05:38:59 2015
@@ -35,34 +35,36 @@ import org.apache.nutch.plugin.PluginRep
 import org.apache.nutch.util.URLUtil;
 
 /**
- * URL normalizer for mapping hosts to their desired form. It takes
- * a simple text file as source in the format:
- *
+ * URL normalizer for mapping hosts to their desired form. It takes a simple
+ * text file as source in the format:
+ * 
  * example.org www.example.org
- *
- * mapping all URL's of example.org the the www sub-domain. It also
- * allows for wildcards to be used to map all sub-domains to another
- * host:
- *
+ * 
+ * mapping all URL's of example.org the the www sub-domain. It also allows for
+ * wildcards to be used to map all sub-domains to another host:
+ * 
  * *.example.org www.example.org
  */
 public class HostURLNormalizer implements URLNormalizer {
 
   private Configuration conf;
 
-  private static final Logger LOG = LoggerFactory.getLogger(HostURLNormalizer.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(HostURLNormalizer.class);
 
   private static String attributeFile = null;
   private String hostsFile = null;
-  private static final HashMap<String,String> hostsMap = new HashMap<String,String>();
+  private static final HashMap<String, String> hostsMap = new HashMap<String, String>();
 
-  public HostURLNormalizer() {}
+  public HostURLNormalizer() {
+  }
 
   public HostURLNormalizer(String hostsFile) {
     this.hostsFile = hostsFile;
   }
 
-  private synchronized void readConfiguration(Reader configReader) throws IOException {
+  private synchronized void readConfiguration(Reader configReader)
+      throws IOException {
     if (hostsMap.size() > 0) {
       return;
     }
@@ -92,8 +94,8 @@ public class HostURLNormalizer implement
 
     // get the extensions for domain urlfilter
     String pluginName = "urlnormalizer-host";
-    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
-      URLNormalizer.class.getName()).getExtensions();
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLNormalizer.class.getName()).getExtensions();
     for (int i = 0; i < extensions.length; i++) {
       Extension extension = extensions[i];
       if (extension.getDescriptor().getPluginId().equals(pluginName)) {
@@ -110,13 +112,12 @@ public class HostURLNormalizer implement
     if (attributeFile != null) {
       if (LOG.isInfoEnabled()) {
         LOG.info("Attribute \"file\" is defined for plugin " + pluginName
-          + " as " + attributeFile);
+            + " as " + attributeFile);
       }
-    }
-    else {
+    } else {
       if (LOG.isWarnEnabled()) {
         LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
-          + pluginName);
+            + pluginName);
       }
     }
 
@@ -125,8 +126,7 @@ public class HostURLNormalizer implement
     String stringRules = conf.get("urlnormalizer.hosts.rules");
     if (hostsFile != null) {
       file = hostsFile;
-    }
-    else if (attributeFile != null) {
+    } else if (attributeFile != null) {
       file = attributeFile;
     }
     Reader reader = null;
@@ -140,13 +140,13 @@ public class HostURLNormalizer implement
         reader = new FileReader(file);
       }
       readConfiguration(reader);
-    }
-    catch (IOException e) {
+    } catch (IOException e) {
       LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
     }
   }
 
-  public String normalize(String urlString, String scope) throws MalformedURLException {
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException {
     String host = new URL(urlString).getHost();
 
     // Test static hosts
@@ -164,7 +164,7 @@ public class HostURLNormalizer implement
     String wildCardHost = new String();
 
     // Add the tld to the buffer
-    hostBuffer.append(hostParts[hostParts.length -1]);
+    hostBuffer.append(hostParts[hostParts.length - 1]);
 
     for (int i = hostParts.length - 2; i > 0; i--) {
       // Prepend another sub domain

Modified: 
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java
 Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * configuration file.
  */
 package org.apache.nutch.net.urlnormalizer.host;
+

Modified: 
nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
 Thu Jan 29 05:38:59 2015
@@ -36,14 +36,22 @@ public class TestHostURLNormalizer {
     normalizer.setConf(conf);
 
     // Force www. sub domain when hitting link without sub domain
-    Assert.assertEquals("http://www.example.org/page.html", normalizer.normalize("http://example.org/page.html", URLNormalizers.SCOPE_DEFAULT));
+    Assert.assertEquals("http://www.example.org/page.html",
+        normalizer.normalize("http://example.org/page.html",
+            URLNormalizers.SCOPE_DEFAULT));
 
     // Force no sub domain to www. URL's
-    Assert.assertEquals("http://example.net/path/to/something.html", normalizer.normalize("http://www.example.net/path/to/something.html", URLNormalizers.SCOPE_DEFAULT));
+    Assert.assertEquals("http://example.net/path/to/something.html", normalizer
+        .normalize("http://www.example.net/path/to/something.html",
+            URLNormalizers.SCOPE_DEFAULT));
 
     // Force all sub domains to www.
-    Assert.assertEquals("http://example.com/?does=it&still=work", normalizer.normalize("http://example.com/?does=it&still=work", URLNormalizers.SCOPE_DEFAULT));
-    Assert.assertEquals("http://example.com/buh", normalizer.normalize("http://http.www.example.com/buh", URLNormalizers.SCOPE_DEFAULT));
-    Assert.assertEquals("http://example.com/blaat", normalizer.normalize("http://whatever.example.com/blaat", URLNormalizers.SCOPE_DEFAULT));
+    Assert.assertEquals("http://example.com/?does=it&still=work", normalizer
+        .normalize("http://example.com/?does=it&still=work",
+            URLNormalizers.SCOPE_DEFAULT));
+    Assert.assertEquals("http://example.com/buh", normalizer.normalize(
+        "http://http.www.example.com/buh", URLNormalizers.SCOPE_DEFAULT));
+    Assert.assertEquals("http://example.com/blaat", normalizer.normalize(
+        "http://whatever.example.com/blaat", URLNormalizers.SCOPE_DEFAULT));
   }
 }

Modified: 
nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
 Thu Jan 29 05:38:59 2015
@@ -24,15 +24,17 @@ import org.apache.nutch.net.URLNormalize
 
 /**
  * This URLNormalizer doesn't change urls. It is sometimes useful if for a given
- * scope at least one normalizer must be defined but no transformations are required.
+ * scope at least one normalizer must be defined but no transformations are
+ * required.
  * 
  * @author Andrzej Bialecki
  */
 public class PassURLNormalizer implements URLNormalizer {
 
   private Configuration conf;
-  
-  public String normalize(String urlString, String scope) throws 
MalformedURLException {
+
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException {
     return urlString;
   }
 
@@ -41,7 +43,7 @@ public class PassURLNormalizer implement
   }
 
   public void setConf(Configuration conf) {
-    this.conf = conf;    
+    this.conf = conf;
   }
 
 }

Modified: 
nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java
 Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * one URL normalizer must be defined in any scope.
  */
 package org.apache.nutch.net.urlnormalizer.pass;
+

Modified: 
nutch/trunk/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
 Thu Jan 29 05:38:59 2015
@@ -29,7 +29,7 @@ public class TestPassURLNormalizer {
   @Test
   public void testPassURLNormalizer() {
     Configuration conf = NutchConfiguration.create();
-    
+
     PassURLNormalizer normalizer = new PassURLNormalizer();
     normalizer.setConf(conf);
     String url = "http://www.example.com/test/..//";
@@ -39,7 +39,7 @@ public class TestPassURLNormalizer {
     } catch (MalformedURLException mue) {
       Assert.fail(mue.toString());
     }
-    
+
     Assert.assertEquals(url, result);
   }
 }

Modified: 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
 Thu Jan 29 05:38:59 2015
@@ -33,18 +33,20 @@ import org.apache.nutch.plugin.PluginRep
 import org.apache.nutch.util.URLUtil;
 
 /**
- * URL normalizer plugin for normalizing query strings but sorting
- * query string parameters. Not sorting query strings can lead to large
- * amounts of duplicate URL's such as ?a=x&b=y vs b=y&a=x.
- *
+ * URL normalizer plugin for normalizing query strings but sorting query string
+ * parameters. Not sorting query strings can lead to large amounts of duplicate
+ * URL's such as ?a=x&b=y vs b=y&a=x.
+ * 
  */
 public class QuerystringURLNormalizer implements URLNormalizer {
 
   private Configuration conf;
 
-  private static final Logger LOG = 
LoggerFactory.getLogger(QuerystringURLNormalizer.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(QuerystringURLNormalizer.class);
 
-  public QuerystringURLNormalizer() {}
+  public QuerystringURLNormalizer() {
+  }
 
   public Configuration getConf() {
     return conf;
@@ -54,20 +56,21 @@ public class QuerystringURLNormalizer im
     this.conf = conf;
   }
 
-  public String normalize(String urlString, String scope) throws 
MalformedURLException {
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException {
     URL url = new URL(urlString);
-    
+
     String queryString = url.getQuery();
-    
+
     if (queryString == null) {
       return urlString;
     }
-    
+
     List<String> queryStringParts = Arrays.asList(queryString.split("&"));
     Collections.sort(queryStringParts);
-    
+
     StringBuilder sb = new StringBuilder();
-    
+
     sb.append(url.getProtocol());
     sb.append("://");
     sb.append(url.getHost());
@@ -82,7 +85,7 @@ public class QuerystringURLNormalizer im
       sb.append("#");
       sb.append(url.getRef());
     }
-    
+
     return sb.toString();
   }
 }

Modified: 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java
 Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * by permutations.
  */
 package org.apache.nutch.net.urlnormalizer.querystring;
+

Modified: 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
 Thu Jan 29 05:38:59 2015
@@ -31,12 +31,19 @@ public class TestQuerystringURLNormalize
 
     QuerystringURLNormalizer normalizer = new QuerystringURLNormalizer();
     normalizer.setConf(conf);
-    
-    assertEquals("http://example.com/?a=b&c=d", normalizer.normalize("http://example.com/?c=d&a=b", URLNormalizers.SCOPE_DEFAULT));
-    assertEquals("http://example.com/a/b/c", normalizer.normalize("http://example.com/a/b/c", URLNormalizers.SCOPE_DEFAULT));
-    assertEquals("http://example.com:1234/a/b/c", normalizer.normalize("http://example.com:1234/a/b/c", URLNormalizers.SCOPE_DEFAULT));
-    assertEquals("http://example.com:1234/a/b/c#ref", normalizer.normalize("http://example.com:1234/a/b/c#ref", URLNormalizers.SCOPE_DEFAULT));
-    assertEquals("http://example.com:1234/a/b/c?a=b&c=d#ref", normalizer.normalize("http://example.com:1234/a/b/c?c=d&a=b#ref", URLNormalizers.SCOPE_DEFAULT));
-    assertEquals("http://example.com/?a=b&a=c&c=d", normalizer.normalize("http://example.com/?c=d&a=b&a=c", URLNormalizers.SCOPE_DEFAULT));
+
+    assertEquals("http://example.com/?a=b&c=d", normalizer.normalize(
+        "http://example.com/?c=d&a=b", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com/a/b/c", normalizer.normalize(
+        "http://example.com/a/b/c", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com:1234/a/b/c", normalizer.normalize(
+        "http://example.com:1234/a/b/c", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com:1234/a/b/c#ref", normalizer.normalize(
+        "http://example.com:1234/a/b/c#ref", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com:1234/a/b/c?a=b&c=d#ref",
+        normalizer.normalize("http://example.com:1234/a/b/c?c=d&a=b#ref",
+            URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com/?a=b&a=c&c=d", normalizer.normalize(
+        "http://example.com/?c=d&a=b&a=c", URLNormalizers.SCOPE_DEFAULT));
   }
 }

Modified: 
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
 Thu Jan 29 05:38:59 2015
@@ -51,19 +51,23 @@ import org.xml.sax.InputSource;
  * Allows users to do regex substitutions on all/any URLs that are encountered,
  * which is useful for stripping session IDs from URLs.
  * 
- * <p>This class uses the <tt>urlnormalizer.regex.file</tt> property.
- * It should be set to the file name of an xml file which should contain the
- * patterns and substitutions to be done on encountered URLs.
+ * <p>
+ * This class uses the <tt>urlnormalizer.regex.file</tt> property. It should be
+ * set to the file name of an xml file which should contain the patterns and
+ * substitutions to be done on encountered URLs.
+ * </p>
+ * <p>
+ * This class also supports different rules depending on the scope. Please see
+ * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details.
  * </p>
- * <p>This class also supports different rules depending on the scope. Please 
see
- * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more 
details.</p>
  * 
  * @author Luke Baker
  * @author Andrzej Bialecki
  */
 public class RegexURLNormalizer extends Configured implements URLNormalizer {
 
-  private static final Logger LOG = 
LoggerFactory.getLogger(RegexURLNormalizer.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(RegexURLNormalizer.class);
 
   /**
    * Class which holds a compiled pattern and its corresponding substition
@@ -75,19 +79,18 @@ public class RegexURLNormalizer extends
     public String substitution;
   }
 
-  private ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal = 
-      new ThreadLocal<HashMap<String,List<Rule>>>() {
-    protected java.util.HashMap<String,java.util.List<Rule>> initialValue() {
+  private ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal = new ThreadLocal<HashMap<String, List<Rule>>>() {
+    protected java.util.HashMap<String, java.util.List<Rule>> initialValue() {
       return new HashMap<String, List<Rule>>();
     };
   };
-  
+
   public HashMap<String, List<Rule>> getScopedRules() {
     return scopedRulesThreadLocal.get();
   }
-  
-  private List<Rule> defaultRules; 
-  
+
+  private List<Rule> defaultRules;
+
   private static final List<Rule> EMPTY_RULES = Collections.emptyList();
 
   /**
@@ -107,7 +110,7 @@ public class RegexURLNormalizer extends
    * configuration files for it.
    */
   public RegexURLNormalizer(Configuration conf, String filename)
-          throws IOException, PatternSyntaxException {
+      throws IOException, PatternSyntaxException {
     super(conf);
     List<Rule> rules = readConfigurationFile(filename);
     if (rules != null) {
@@ -117,7 +120,8 @@ public class RegexURLNormalizer extends
 
   public void setConf(Configuration conf) {
     super.setConf(conf);
-    if (conf == null) return;
+    if (conf == null)
+      return;
     // the default constructor was called
 
     String filename = getConf().get("urlnormalizer.regex.file");
@@ -147,9 +151,10 @@ public class RegexURLNormalizer extends
   void setConfiguration(Reader reader, String scope) {
     List<Rule> rules = readConfiguration(reader);
     getScopedRules().put(scope, rules);
-    LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " 
rules.");
+    LOG.debug("Set config for scope '" + scope + "': " + rules.size()
+        + " rules.");
   }
-  
+
   /**
    * This function does the replacements by iterating through all the regex
    * patterns. It accepts a string url as input and returns the altered string.
@@ -190,7 +195,7 @@ public class RegexURLNormalizer extends
   }
 
   public String normalize(String urlString, String scope)
-          throws MalformedURLException {
+      throws MalformedURLException {
     return regexNormalize(urlString, scope);
   }
 
@@ -207,17 +212,17 @@ public class RegexURLNormalizer extends
       return EMPTY_RULES;
     }
   }
-  
+
   private List<Rule> readConfiguration(Reader reader) {
     List<Rule> rules = new ArrayList<Rule>();
     try {
 
       // borrowed heavily from code in Configuration.java
       Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
-              .parse(new InputSource(reader));
+          .parse(new InputSource(reader));
       Element root = doc.getDocumentElement();
       if ((!"regex-normalize".equals(root.getTagName()))
-              && (LOG.isErrorEnabled())) {
+          && (LOG.isErrorEnabled())) {
         LOG.error("bad conf file: top-level element not <regex-normalize>");
       }
       NodeList regexes = root.getChildNodes();
@@ -240,7 +245,7 @@ public class RegexURLNormalizer extends
           if ("pattern".equals(field.getTagName()) && field.hasChildNodes())
             patternValue = ((Text) field.getFirstChild()).getData();
           if ("substitution".equals(field.getTagName())
-                  && field.hasChildNodes())
+              && field.hasChildNodes())
             subValue = ((Text) field.getFirstChild()).getData();
           if (!field.hasChildNodes())
             subValue = "";
@@ -251,7 +256,8 @@ public class RegexURLNormalizer extends
             rule.pattern = Pattern.compile(patternValue);
           } catch (PatternSyntaxException e) {
             if (LOG.isErrorEnabled()) {
-              LOG.error("skipped rule: " + patternValue + " -> " + subValue + 
" : invalid regular expression pattern: " + e);
+              LOG.error("skipped rule: " + patternValue + " -> " + subValue
+                  + " : invalid regular expression pattern: " + e);
             }
             continue;
           }
@@ -265,13 +271,14 @@ public class RegexURLNormalizer extends
       }
       return EMPTY_RULES;
     }
-    if (rules.size() == 0) return EMPTY_RULES;
+    if (rules.size() == 0)
+      return EMPTY_RULES;
     return rules;
   }
 
   /** Spits out patterns and substitutions that are in the configuration file. */
   public static void main(String args[]) throws PatternSyntaxException,
-          IOException {
+      IOException {
     RegexURLNormalizer normalizer = new RegexURLNormalizer();
     normalizer.setConf(NutchConfiguration.create());
     HashMap<String, List<Rule>> scopedRules = normalizer.getScopedRules();
@@ -290,9 +297,10 @@ public class RegexURLNormalizer extends
       Iterator<String> it = scopedRules.keySet().iterator();
       while (it.hasNext()) {
         String scope = it.next();
-        if (URLNormalizers.SCOPE_DEFAULT.equals(scope)) continue;
+        if (URLNormalizers.SCOPE_DEFAULT.equals(scope))
+          continue;
         System.out.println("* Rules for '" + scope + "' scope:");
-        i = ((List<Rule>)scopedRules.get(scope)).iterator();
+        i = ((List<Rule>) scopedRules.get(scope)).iterator();
         while (i.hasNext()) {
           Rule r = (Rule) i.next();
           System.out.print("  " + r.pattern.pattern() + " -> ");
@@ -303,10 +311,12 @@ public class RegexURLNormalizer extends
     if (args.length > 0) {
       System.out.println("\n---------- Normalizer test -----------");
       String scope = URLNormalizers.SCOPE_DEFAULT;
-      if (args.length > 1) scope = args[1];
+      if (args.length > 1)
+        scope = args[1];
       System.out.println("Scope: " + scope);
       System.out.println("Input url:  '" + args[0] + "'");
-      System.out.println("Output url: '" + normalizer.normalize(args[0], 
scope) + "'");
+      System.out.println("Output url: '" + normalizer.normalize(args[0], scope)
+          + "'");
     }
     System.exit(0);
   }

Modified: 
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java
 Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * ({@link java.util.regex.Pattern}).
  */
 package org.apache.nutch.net.urlnormalizer.regex;
+

Modified: 
nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
 (original)
+++ 
nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
 Thu Jan 29 05:38:59 2015
@@ -36,24 +36,27 @@ import org.apache.nutch.util.NutchConfig
 
 /** Unit tests for RegexUrlNormalizer. */
 public class TestRegexURLNormalizer {
-  private static final Logger LOG = 
LoggerFactory.getLogger(TestRegexURLNormalizer.class);
-  
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TestRegexURLNormalizer.class);
+
   private RegexURLNormalizer normalizer;
   private Configuration conf;
   private Map<String, NormalizedURL[]> testData = new HashMap<String, NormalizedURL[]>();
-  
+
   // This system property is defined in ./src/plugin/build-plugin.xml
   private String sampleDir = System.getProperty("test.data", ".");
+
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/urlnormalizer-regex/build.xml during plugin compilation.
-  
+
   public TestRegexURLNormalizer() throws IOException {
     normalizer = new RegexURLNormalizer();
     conf = NutchConfiguration.create();
     normalizer.setConf(conf);
     File[] configs = new File(sampleDir).listFiles(new FileFilter() {
       public boolean accept(File f) {
-        if (f.getName().endsWith(".xml") && 
f.getName().startsWith("regex-normalize-"))
+        if (f.getName().endsWith(".xml")
+            && f.getName().startsWith("regex-normalize-"))
           return true;
         return false;
       }
@@ -74,8 +77,8 @@ public class TestRegexURLNormalizer {
 
   @Test
   public void testNormalizerDefault() throws Exception {
-    normalizeTest((NormalizedURL[])testData.get(URLNormalizers.SCOPE_DEFAULT),
-            URLNormalizers.SCOPE_DEFAULT);
+    normalizeTest((NormalizedURL[]) testData.get(URLNormalizers.SCOPE_DEFAULT),
+        URLNormalizers.SCOPE_DEFAULT);
   }
 
   @Test
@@ -83,33 +86,36 @@ public class TestRegexURLNormalizer {
     Iterator<String> it = testData.keySet().iterator();
     while (it.hasNext()) {
       String scope = it.next();
-      normalizeTest((NormalizedURL[])testData.get(scope), scope);
+      normalizeTest((NormalizedURL[]) testData.get(scope), scope);
     }
   }
 
-  private void normalizeTest(NormalizedURL[] urls, String scope) throws 
Exception {
+  private void normalizeTest(NormalizedURL[] urls, String scope)
+      throws Exception {
     for (int i = 0; i < urls.length; i++) {
       String url = urls[i].url;
       String normalized = normalizer.normalize(urls[i].url, scope);
       String expected = urls[i].expectedURL;
-      LOG.info("scope: " + scope + " url: " + url + " | normalized: " + 
normalized + " | expected: " + expected);
+      LOG.info("scope: " + scope + " url: " + url + " | normalized: "
+          + normalized + " | expected: " + expected);
       Assert.assertEquals(urls[i].expectedURL, normalized);
     }
   }
-       
+
   private void bench(int loops, String scope) {
     long start = System.currentTimeMillis();
     try {
-      NormalizedURL[] expected = (NormalizedURL[])testData.get(scope);
-      if (expected == null) return;
+      NormalizedURL[] expected = (NormalizedURL[]) testData.get(scope);
+      if (expected == null)
+        return;
       for (int i = 0; i < loops; i++) {
         normalizeTest(expected, scope);
       }
     } catch (Exception e) {
       Assert.fail(e.toString());
     }
-    LOG.info("bench time (" + loops + ") " +
-             (System.currentTimeMillis() - start) + "ms");
+    LOG.info("bench time (" + loops + ") "
+        + (System.currentTimeMillis() - start) + "ms");
   }
 
   private static class NormalizedURL {
@@ -126,17 +132,18 @@ public class TestRegexURLNormalizer {
   private NormalizedURL[] readTestFile(String scope) throws IOException {
     File f = new File(sampleDir, "regex-normalize-" + scope + ".test");
     @SuppressWarnings("resource")
-    BufferedReader in = new BufferedReader(new InputStreamReader(new 
FileInputStream(f), "UTF-8"));
+    BufferedReader in = new BufferedReader(new InputStreamReader(
+        new FileInputStream(f), "UTF-8"));
     List<NormalizedURL> list = new ArrayList<NormalizedURL>();
     String line;
-    while((line = in.readLine()) != null) {
-      if (  line.trim().length() == 0 ||
-            line.startsWith("#") ||
-            line.startsWith(" ")) continue;
+    while ((line = in.readLine()) != null) {
+      if (line.trim().length() == 0 || line.startsWith("#")
+          || line.startsWith(" "))
+        continue;
       list.add(new NormalizedURL(line));
     }
     return (NormalizedURL[]) list.toArray(new NormalizedURL[list.size()]);
-  }  
+  }
 
   public static void main(String[] args) throws Exception {
     if (args.length == 0) {
@@ -150,7 +157,8 @@ public class TestRegexURLNormalizer {
       if (args[i].equals("-bench")) {
         bench = true;
         iter = Integer.parseInt(args[++i]);
-      } else scope = args[i];
+      } else
+        scope = args[i];
     }
     if (scope == null) {
       System.err.println("Missing required scope name.");
@@ -161,11 +169,12 @@ public class TestRegexURLNormalizer {
       System.exit(-1);
     }
     TestRegexURLNormalizer test = new TestRegexURLNormalizer();
-    NormalizedURL[] urls = (NormalizedURL[])test.testData.get(scope);
+    NormalizedURL[] urls = (NormalizedURL[]) test.testData.get(scope);
     if (urls == null) {
-      LOG.warn("Missing test data for scope '" + scope + "', using default 
scope.");
+      LOG.warn("Missing test data for scope '" + scope
+          + "', using default scope.");
       scope = URLNormalizers.SCOPE_DEFAULT;
-      urls = (NormalizedURL[])test.testData.get(scope);
+      urls = (NormalizedURL[]) test.testData.get(scope);
     }
     if (bench) {
       test.bench(iter, scope);
@@ -174,6 +183,4 @@ public class TestRegexURLNormalizer {
     }
   }
 
-
-
 }

Modified: 
nutch/trunk/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java 
(original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java 
Thu Jan 29 05:38:59 2015
@@ -35,19 +35,26 @@ import org.slf4j.LoggerFactory;
 
 /**
  * Emulate a continuous crawl for one URL.
- *
+ * 
  */
 public class ContinuousCrawlTestUtil extends TestCase {
 
-  private static final Logger LOG = 
LoggerFactory.getLogger(ContinuousCrawlTestUtil.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(ContinuousCrawlTestUtil.class);
 
   protected static Text dummyURL = new Text("http://nutch.apache.org/");
 
   protected static Configuration defaultConfig = CrawlDBTestUtil
       .createConfiguration();
 
-  protected long interval = FetchSchedule.SECONDS_PER_DAY*1000; // (default) 
launch crawler every day
-  protected long duration = 2*365L*FetchSchedule.SECONDS_PER_DAY*1000L; // run 
for two years
+  protected long interval = FetchSchedule.SECONDS_PER_DAY * 1000; // (default)
+                                                                  // launch
+                                                                  // crawler
+                                                                  // every day
+  protected long duration = 2 * 365L * FetchSchedule.SECONDS_PER_DAY * 1000L; 
// run
+                                                                              
// for
+                                                                              
// two
+                                                                              
// years
 
   protected Configuration configuration;
   private FetchSchedule schedule;
@@ -62,7 +69,7 @@ public class ContinuousCrawlTestUtil ext
   protected Content content = new Content();
 
   {
-    byte[] data = {'n', 'u', 't', 'c', 'h'};
+    byte[] data = { 'n', 'u', 't', 'c', 'h' };
     content.setContent(data);
   }
 
@@ -89,17 +96,17 @@ public class ContinuousCrawlTestUtil ext
 
   /** set the interval the crawl is relaunched (default: every day) */
   protected void setInterval(int seconds) {
-    interval = seconds*1000L;
+    interval = seconds * 1000L;
   }
 
   /** set the duration of the continuous crawl (default = 2 years) */
   protected void setDuraction(int seconds) {
-    duration = seconds*1000L;
+    duration = seconds * 1000L;
   }
 
   /**
    * default fetch action: set status and time
-   *
+   * 
    * @param datum
    *          CrawlDatum to fetch
    * @param currentTime
@@ -124,19 +131,20 @@ public class ContinuousCrawlTestUtil ext
    * change content to force a changed signature
    */
   protected void changeContent() {
-    byte [] data = Arrays.copyOf(content.getContent(), 
content.getContent().length+1);
+    byte[] data = Arrays.copyOf(content.getContent(),
+        content.getContent().length + 1);
     data[content.getContent().length] = '2'; // append one byte
     content.setContent(data);
     LOG.info("document content changed");
   }
 
-
   /**
    * default parse action: add signature if successfully fetched
-   *
+   * 
    * @param fetchDatum
    *          fetch datum
-   * @return list of all datums resulting from parse (status: signature, linked, parse_metadata)
+   * @return list of all datums resulting from parse (status: signature, linked,
+   *         parse_metadata)
    */
   protected List<CrawlDatum> parse(CrawlDatum fetchDatum) {
     List<CrawlDatum> parseDatums = new ArrayList<CrawlDatum>(0);
@@ -150,7 +158,7 @@ public class ContinuousCrawlTestUtil ext
 
   /**
    * default implementation to check the result state
-   *
+   * 
    * @param datum
    *          the CrawlDatum to be checked
    * @return true if the check succeeds
@@ -166,7 +174,7 @@ public class ContinuousCrawlTestUtil ext
    * <p>
    * A loop emulates a continuous crawl launched in regular intervals (see
    * {@link #setInterval(int)} over a longer period ({@link #setDuraction(int)}.
-   *
+   * 
    * <ul>
    * <li>every "round" emulates
    * <ul>
@@ -177,11 +185,11 @@ public class ContinuousCrawlTestUtil ext
    * <li>and is checked whether it is correct (see {@link #check(CrawlDatum)})
    * </ul>
    * </p>
-   *
+   * 
    * @param maxErrors
    *          (if > 0) continue crawl even if the checked CrawlDatum is not
    *          correct, but stop after max. number of errors
-   *
+   * 
    * @return false if a check of CrawlDatum failed, true otherwise
    */
   protected boolean run(int maxErrors) {
@@ -205,9 +213,11 @@ public class ContinuousCrawlTestUtil ext
     long lastFetchTime = -1;
     boolean ok = true; // record failure but keep going
     CrawlDatum fetchDatum = new CrawlDatum();
-    /* Keep copies because CrawlDbReducer.reduce()
-     * and FetchSchedule.shouldFetch() may alter the references.
-     * Copies are used for verbose logging in case of an error. */
+    /*
+     * Keep copies because CrawlDbReducer.reduce() and
+     * FetchSchedule.shouldFetch() may alter the references. Copies are used for
+     * verbose logging in case of an error.
+     */
     CrawlDatum copyDbDatum = new CrawlDatum();
     CrawlDatum copyFetchDatum = new CrawlDatum();
     CrawlDatum afterShouldFetch = new CrawlDatum();

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Thu Jan 29 
05:38:59 2015
@@ -36,10 +36,10 @@ import org.mortbay.jetty.bio.SocketConne
 import org.mortbay.jetty.handler.ContextHandler;
 import org.mortbay.jetty.handler.ResourceHandler;
 
-
 public class CrawlDBTestUtil {
 
-  private static final Logger LOG = 
LoggerFactory.getLogger(CrawlDBTestUtil.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(CrawlDBTestUtil.class);
 
   /**
    * Creates synthetic crawldb
@@ -52,12 +52,12 @@ public class CrawlDBTestUtil {
    *          urls to be inserted, objects are of type URLCrawlDatum
    * @throws Exception
    */
-  public static void createCrawlDb(Configuration conf, FileSystem fs, Path 
crawldb, List<URLCrawlDatum> init)
-      throws Exception {
+  public static void createCrawlDb(Configuration conf, FileSystem fs,
+      Path crawldb, List<URLCrawlDatum> init) throws Exception {
     LOG.trace("* creating crawldb: " + crawldb);
     Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
-    MapFile.Writer writer = new MapFile.Writer(conf, fs, new Path(dir, 
"part-00000")
-        .toString(), Text.class, CrawlDatum.class);
+    MapFile.Writer writer = new MapFile.Writer(conf, fs, new Path(dir,
+        "part-00000").toString(), Text.class, CrawlDatum.class);
     Iterator<URLCrawlDatum> it = init.iterator();
     while (it.hasNext()) {
       URLCrawlDatum row = it.next();
@@ -69,25 +69,25 @@ public class CrawlDBTestUtil {
 
   /**
    * For now we need to manually construct our Configuration, because we need 
to
-   * override the default one and it is currently not possible to use 
dynamically
-   * set values.
+   * override the default one and it is currently not possible to use
+   * dynamically set values.
    * 
    * @return
    * @deprecated Use {@link #createConfiguration()} instead
    */
   @Deprecated
-  public static Configuration create(){
+  public static Configuration create() {
     return createConfiguration();
   }
 
   /**
    * For now we need to manually construct our Configuration, because we need 
to
-   * override the default one and it is currently not possible to use 
dynamically
-   * set values.
+   * override the default one and it is currently not possible to use
+   * dynamically set values.
    * 
    * @return
    */
-  public static Configuration createConfiguration(){
+  public static Configuration createConfiguration() {
     Configuration conf = new Configuration();
     conf.addResource("nutch-default.xml");
     conf.addResource("crawl-tests.xml");
@@ -108,32 +108,36 @@ public class CrawlDBTestUtil {
 
   /**
    * Generate seedlist
-   * @throws IOException 
+   * 
+   * @throws IOException
    */
-  public static void generateSeedList(FileSystem fs, Path urlPath, 
List<String> urls) throws IOException{
+  public static void generateSeedList(FileSystem fs, Path urlPath,
+      List<String> urls) throws IOException {
     generateSeedList(fs, urlPath, urls, new ArrayList<String>());
   }
-  
+
   /**
    * Generate seedlist
-   * @throws IOException 
+   * 
+   * @throws IOException
    */
-  public static void generateSeedList(FileSystem fs, Path urlPath, 
List<String> urls, List<String>metadata) throws IOException{
+  public static void generateSeedList(FileSystem fs, Path urlPath,
+      List<String> urls, List<String> metadata) throws IOException {
     FSDataOutputStream out;
-    Path file=new Path(urlPath,"urls.txt");
+    Path file = new Path(urlPath, "urls.txt");
     fs.mkdirs(urlPath);
-    out=fs.create(file);
-    
-    Iterator<String> urls_i=urls.iterator();
-    Iterator<String> metadata_i=metadata.iterator();
-    
+    out = fs.create(file);
+
+    Iterator<String> urls_i = urls.iterator();
+    Iterator<String> metadata_i = metadata.iterator();
+
     String url;
     String md;
-    while(urls_i.hasNext()){
-      url=urls_i.next();
+    while (urls_i.hasNext()) {
+      url = urls_i.next();
 
       out.writeBytes(url);
-            
+
       if (metadata_i.hasNext()) {
         md = metadata_i.next();
         out.writeBytes(md);
@@ -141,19 +145,22 @@ public class CrawlDBTestUtil {
 
       out.writeBytes("\n");
     }
-    
+
     out.flush();
     out.close();
   }
-  
+
   /**
    * Creates a new JettyServer with one static root context
    * 
-   * @param port port to listen to
-   * @param staticContent folder where static content lives
-   * @throws UnknownHostException 
+   * @param port
+   *          port to listen to
+   * @param staticContent
+   *          folder where static content lives
+   * @throws UnknownHostException
    */
-  public static Server getServer(int port, String staticContent) throws 
UnknownHostException{
+  public static Server getServer(int port, String staticContent)
+      throws UnknownHostException {
     Server webServer = new org.mortbay.jetty.Server();
     SocketConnector listener = new SocketConnector();
     listener.setPort(port);

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java 
(original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java Thu Jan 
29 05:38:59 2015
@@ -45,7 +45,8 @@ import org.slf4j.LoggerFactory;
  */
 public class CrawlDbUpdateUtil<T extends Reducer<Text, CrawlDatum, Text, 
CrawlDatum>> {
 
-  private static final Logger LOG = 
LoggerFactory.getLogger(CrawlDbUpdateUtil.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(CrawlDbUpdateUtil.class);
 
   private T reducer;
 
@@ -74,9 +75,8 @@ public class CrawlDbUpdateUtil<T extends
   }
 
   /**
-   * Dummy reporter which does nothing and does not return null for
-   * getCounter()
-   *
+   * Dummy reporter which does nothing and does not return null for 
getCounter()
+   * 
    * @see {@link Reporter#NULL}
    */
   private class DummyReporter implements Reporter {
@@ -117,8 +117,10 @@ public class CrawlDbUpdateUtil<T extends
    * run
    * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
    * and return the CrawlDatum(s) which would have been written into CrawlDb
-   * @param values  list of input CrawlDatums
-   * @return  list of resulting CrawlDatum(s) in CrawlDb
+   * 
+   * @param values
+   *          list of input CrawlDatums
+   * @return list of resulting CrawlDatum(s) in CrawlDb
    */
   public List<CrawlDatum> update(List<CrawlDatum> values) {
     if (values == null || values.size() == 0) {
@@ -138,12 +140,14 @@ public class CrawlDbUpdateUtil<T extends
    * run
    * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
    * and return the CrawlDatum(s) which would have been written into CrawlDb
-   * @param dbDatum  previous CrawlDatum in CrawlDb
-   * @param fetchDatum  CrawlDatum resulting from fetching
-   * @return  list of resulting CrawlDatum(s) in CrawlDb
+   * 
+   * @param dbDatum
+   *          previous CrawlDatum in CrawlDb
+   * @param fetchDatum
+   *          CrawlDatum resulting from fetching
+   * @return list of resulting CrawlDatum(s) in CrawlDb
    */
-  public List<CrawlDatum> update(CrawlDatum dbDatum,
-      CrawlDatum fetchDatum) {
+  public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) {
     List<CrawlDatum> values = new ArrayList<CrawlDatum>();
     if (dbDatum != null)
       values.add(dbDatum);

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/DummyWritable.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/DummyWritable.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/DummyWritable.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/DummyWritable.java Thu Jan 29 
05:38:59 2015
@@ -21,12 +21,12 @@ import org.apache.hadoop.io.IntWritable;
 
 public class DummyWritable extends IntWritable {
 
-    public DummyWritable() {
+  public DummyWritable() {
 
-    }
+  }
 
-    public DummyWritable(int i) {
-        super(i);
-    }
+  public DummyWritable(int i) {
+    super(i);
+  }
 
 }

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java 
(original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java Thu 
Jan 29 05:38:59 2015
@@ -13,11 +13,12 @@ import org.slf4j.LoggerFactory;
 
 public class TODOTestCrawlDbStates extends TestCrawlDbStates {
 
-  private static final Logger LOG = 
LoggerFactory.getLogger(TODOTestCrawlDbStates.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TODOTestCrawlDbStates.class);
 
   /**
-   * NUTCH-578: a fetch_retry should result in a db_gone if db.fetch.retry.max 
is reached.
-   * Retry counter has to be reset appropriately.
+   * NUTCH-578: a fetch_retry should result in a db_gone if db.fetch.retry.max
+   * is reached. Retry counter has to be reset appropriately.
    */
   @Test
   public void testCrawlDbReducerPageRetrySchedule() {
@@ -86,8 +87,7 @@ public class TODOTestCrawlDbStates exten
    * <p>
    * Problem: documents not modified for a longer time are fetched in every
    * cycle because of an error in the SYNC_DELTA calculation of
-   * {@link AdaptiveFetchSchedule}.
-   * <br>
+   * {@link AdaptiveFetchSchedule}. <br>
    * The next fetch time should always be in the future, never in the past.
    * </p>
    */
@@ -95,15 +95,15 @@ public class TODOTestCrawlDbStates exten
   public void testAdaptiveFetchScheduleSyncDelta() {
     LOG.info("NUTCH-1564 test SYNC_DELTA calculation of 
AdaptiveFetchSchedule");
     Configuration conf = CrawlDBTestUtil.createConfiguration();
-    conf.setLong("db.fetch.interval.default",               172800); // 2 days
-    conf.setLong("db.fetch.schedule.adaptive.min_interval",  86400); // 1 day
+    conf.setLong("db.fetch.interval.default", 172800); // 2 days
+    conf.setLong("db.fetch.schedule.adaptive.min_interval", 86400); // 1 day
     conf.setLong("db.fetch.schedule.adaptive.max_interval", 604800); // 7 days
-    conf.setLong("db.fetch.interval.max",                   604800); // 7 days
+    conf.setLong("db.fetch.interval.max", 604800); // 7 days
     conf.set("db.fetch.schedule.class",
         "org.apache.nutch.crawl.AdaptiveFetchSchedule");
     ContinuousCrawlTestUtil crawlUtil = new 
CrawlTestFetchScheduleNotModifiedFetchTime(
         conf);
-    crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY/3);
+    crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY / 3);
     if (!crawlUtil.run(100)) {
       fail("failed: sync_delta calculation with AdaptiveFetchSchedule");
     }
@@ -150,10 +150,10 @@ public class TODOTestCrawlDbStates exten
           // next fetch time is in less than one minute
           // (critical: Nutch can hardly be so fast)
           LOG.error("Less then one minute until next fetch: " + result);
-       }
+        }
         // Next fetch time should be within min. and max. (tolerance: 60 sec.)
-        if (secondsUntilNextFetch+60 < minInterval
-            || secondsUntilNextFetch-60 > maxInterval) {
+        if (secondsUntilNextFetch + 60 < minInterval
+            || secondsUntilNextFetch - 60 > maxInterval) {
           LOG.error("Interval until next fetch time ("
               + TimingUtil.elapsedTime(fetchTime, result.getFetchTime())
               + ") is not within min. and max. interval: " + result);

Modified: 
nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java 
(original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java 
Thu Jan 29 05:38:59 2015
@@ -61,19 +61,19 @@ public class TestAdaptiveFetchSchedule e
     Text url = new Text("http://www.example.com");
 
     changed = FetchSchedule.STATUS_UNKNOWN;
-    fs.setFetchSchedule(url, p, p.getFetchTime(),
-        p.getModifiedTime(), curTime, lastModified, changed);
+    fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime,
+        lastModified, changed);
     validateFetchInterval(changed, p.getFetchInterval());
 
     changed = FetchSchedule.STATUS_MODIFIED;
-    fs.setFetchSchedule(url, p, p.getFetchTime(),
-        p.getModifiedTime(), curTime, lastModified, changed);
+    fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime,
+        lastModified, changed);
     validateFetchInterval(changed, p.getFetchInterval());
     p.setFetchInterval(interval);
 
     changed = FetchSchedule.STATUS_NOTMODIFIED;
-    fs.setFetchSchedule(url, p, p.getFetchTime(),
-        p.getModifiedTime(), curTime, lastModified, changed);
+    fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime,
+        lastModified, changed);
     validateFetchInterval(changed, p.getFetchInterval());
 
   }

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java 
(original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java Thu Jan 
29 05:38:59 2015
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
- 
+
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
@@ -34,9 +34,9 @@ import org.junit.Before;
 import org.junit.Test;
 
 /**
- * CrawlDbFiltering test which tests for correct, error free url 
- * normalization when the CrawlDB includes urls with <code>DB GONE</code> 
status 
- * and <code>CRAWLDB_PURGE_404</code> is set to true.
+ * CrawlDbFiltering test which tests for correct, error free url normalization
+ * when the CrawlDB includes urls with <code>DB GONE</code> status and
+ * <code>CRAWLDB_PURGE_404</code> is set to true.
  * 
  * @author lufeng
  */
@@ -68,27 +68,27 @@ public class TestCrawlDbFilter {
 
   /**
    * Test url404Purging
-   *
+   * 
    * @throws Exception
    */
   @Test
   public void testUrl404Purging() throws Exception {
     // create a CrawlDatum with DB GONE status
     ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
-    list.add(new URLCrawlDatum(new Text("http://www.example.com"), new CrawlDatum(
-      CrawlDatum.STATUS_DB_GONE, 0, 0.0f)));
-    list.add(new URLCrawlDatum(new Text("http://www.example1.com"), new CrawlDatum(
-      CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f)));
-    list.add(new URLCrawlDatum(new Text("http://www.example2.com"), new CrawlDatum(
-      CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f)));
+    list.add(new URLCrawlDatum(new Text("http://www.example.com"),
+        new CrawlDatum(CrawlDatum.STATUS_DB_GONE, 0, 0.0f)));
+    list.add(new URLCrawlDatum(new Text("http://www.example1.com"),
+        new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f)));
+    list.add(new URLCrawlDatum(new Text("http://www.example2.com"),
+        new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f)));
     dbDir = new Path(testdir, "crawldb");
-    newCrawlDb = new Path(testdir,"newcrawldb");
+    newCrawlDb = new Path(testdir, "newcrawldb");
     // create crawldb
     CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
     // set CRAWLDB_PURGE_404 to true
-    conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404,true);
-    conf.setBoolean(CrawlDbFilter.URL_NORMALIZING,true);
-    conf.setBoolean(CrawlDbFilter.URL_FILTERING,false);
+    conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404, true);
+    conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true);
+    conf.setBoolean(CrawlDbFilter.URL_FILTERING, false);
     conf.setInt("urlnormalizer.loop.count", 2);
     JobConf job = new NutchJob(conf);
     job.setJobName("Test CrawlDbFilter");
@@ -105,8 +105,7 @@ public class TestCrawlDbFilter {
     job.setOutputValueClass(CrawlDatum.class);
     JobClient.runJob(job);
 
-    Path fetchlist = new Path(new Path(newCrawlDb,
-      "part-00000"), "data");
+    Path fetchlist = new Path(new Path(newCrawlDb, "part-00000"), "data");
 
     ArrayList<URLCrawlDatum> l = readContents(fetchlist);
 
@@ -116,11 +115,14 @@ public class TestCrawlDbFilter {
 
   /**
    * Read contents of fetchlist.
-   * @param fetchlist  path to Generated fetchlist
+   * 
+   * @param fetchlist
+   *          path to Generated fetchlist
    * @return Generated {@link URLCrawlDatum} objects
    * @throws IOException
    */
-  private ArrayList<URLCrawlDatum> readContents(Path fetchlist) throws 
IOException {
+  private ArrayList<URLCrawlDatum> readContents(Path fetchlist)
+      throws IOException {
     // verify results
     SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf);
 

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java 
(original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java Thu Jan 
29 05:38:59 2015
@@ -34,18 +34,15 @@ import org.junit.Before;
 import org.junit.Test;
 
 public class TestCrawlDbMerger {
-  private static final Logger LOG = 
Logger.getLogger(CrawlDbMerger.class.getName());
-  
+  private static final Logger LOG = Logger.getLogger(CrawlDbMerger.class
+      .getName());
+
   String url10 = "http://example.com/";
   String url11 = "http://example.com/foo";
   String url20 = "http://example.com/";
   String url21 = "http://example.com/bar";
-  String[] urls_expected = new String[] {
-          url10,
-          url11,
-          url21
-  };
-  
+  String[] urls_expected = new String[] { url10, url11, url21 };
+
   TreeSet<String> init1 = new TreeSet<String>();
   TreeSet<String> init2 = new TreeSet<String>();
   HashMap<String, CrawlDatum> expected = new HashMap<String, CrawlDatum>();
@@ -54,7 +51,7 @@ public class TestCrawlDbMerger {
   FileSystem fs;
   Path testDir;
   CrawlDbReader reader;
-  
+
   @Before
   public void setUp() throws Exception {
     init1.add(url10);
@@ -81,20 +78,21 @@ public class TestCrawlDbMerger {
     expected.put(url21, cd2);
     conf = NutchConfiguration.create();
     fs = FileSystem.get(conf);
-    testDir = new Path("test-crawldb-" +
-            new java.util.Random().nextInt());
+    testDir = new Path("test-crawldb-" + new java.util.Random().nextInt());
     fs.mkdirs(testDir);
   }
-  
+
   @After
   public void tearDown() {
     try {
       if (fs.exists(testDir))
         fs.delete(testDir, true);
-    } catch (Exception e) { }
+    } catch (Exception e) {
+    }
     try {
       reader.close();
-    } catch (Exception e) { }
+    } catch (Exception e) {
+    }
   }
 
   @Test
@@ -106,7 +104,7 @@ public class TestCrawlDbMerger {
     createCrawlDb(conf, fs, crawldb2, init2, cd2);
     CrawlDbMerger merger = new CrawlDbMerger(conf);
     LOG.fine("* merging crawldbs to " + output);
-    merger.merge(output, new Path[]{crawldb1, crawldb2}, false, false);
+    merger.merge(output, new Path[] { crawldb1, crawldb2 }, false, false);
     LOG.fine("* reading crawldb: " + output);
     reader = new CrawlDbReader();
     String crawlDb = output.toString();
@@ -127,11 +125,13 @@ public class TestCrawlDbMerger {
     reader.close();
     fs.delete(testDir, true);
   }
-  
-  private void createCrawlDb(Configuration config, FileSystem fs, Path 
crawldb, TreeSet<String> init, CrawlDatum cd) throws Exception {
+
+  private void createCrawlDb(Configuration config, FileSystem fs, Path crawldb,
+      TreeSet<String> init, CrawlDatum cd) throws Exception {
     LOG.fine("* creating crawldb: " + crawldb);
     Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
-    MapFile.Writer writer = new MapFile.Writer(config, fs, new Path(dir, 
"part-00000").toString(), Text.class, CrawlDatum.class);
+    MapFile.Writer writer = new MapFile.Writer(config, fs, new Path(dir,
+        "part-00000").toString(), Text.class, CrawlDatum.class);
     Iterator<String> it = init.iterator();
     while (it.hasNext()) {
       String key = it.next();

svn commit: r1655526 [23/26] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...

Reply via email to