Author: markus
Date: Mon Nov 22 14:56:40 2010
New Revision: 1037742

URL: http://svn.apache.org/viewvc?rev=1037742&view=rev
Log:
NUTCH-935 - remove unnecessary /./ in basic urlnormalizer (via Stondubleyt)

Modified:
    
nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
    
nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java

Modified: 
nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1037742&r1=1037741&r2=1037742&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original)
+++ nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Mon Nov 22 14:56:40 2010
@@ -43,6 +43,7 @@ public class BasicURLNormalizer implemen
       };
     private Rule relativePathRule = null;
     private Rule leadingRelativePathRule = null;
+    private Rule currentPathRule = null;
     private Rule adjacentSlashRule = null;
 
     private Configuration conf;
@@ -65,6 +66,13 @@ public class BasicURLNormalizer implemen
           compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
         leadingRelativePathRule.substitution = new Perl5Substitution("/");
 
+        // this pattern tries to find spots like "/./" in the url,
+        // which could be replaced by "/"
+        currentPathRule = new Rule();
+        currentPathRule.pattern = (Perl5Pattern)
+          compiler.compile("(/\\./)", Perl5Compiler.READ_ONLY_MASK);
+        currentPathRule.substitution = new Perl5Substitution("/");
+
         // this pattern tries to find spots like "xx//yy" in the url,
         // which could be replaced by a "/"
         adjacentSlashRule = new Rule();
@@ -171,6 +179,11 @@ public class BasicURLNormalizer implemen
             fileWorkCopy = Util.substitute
               (matcher, leadingRelativePathRule.pattern,
                leadingRelativePathRule.substitution, fileWorkCopy, 1);
+
+            // remove unnecessary "/./"
+            fileWorkCopy = Util.substitute
+            (matcher, currentPathRule.pattern,
+                       currentPathRule.substitution, fileWorkCopy, 1);
             
             
             // collapse adjacent slashes with "/"

Modified: 
nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=1037742&r1=1037741&r2=1037742&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (original)
+++ nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java Mon Nov 22 14:56:40 2010
@@ -60,6 +60,9 @@ public class TestBasicURLNormalizer exte
     //     normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
 
     // check that unnecessary "../" are removed
+
+    normalizeTest("http://foo.com/aa/./foo.html",
+                  "http://foo.com/aa/foo.html" );
     normalizeTest("http://foo.com/aa/../",
                   "http://foo.com/" );
     normalizeTest("http://foo.com/aa/bb/../",
@@ -112,4 +115,5 @@ public class TestBasicURLNormalizer exte
 
 
 
-}
+
+}
\ No newline at end of file


Reply via email to