Author: markus
Date: Mon Nov 22 14:56:40 2010
New Revision: 1037742
URL: http://svn.apache.org/viewvc?rev=1037742&view=rev
Log:
NUTCH-935 - Remove unnecessary "/./" path segments in the basic URL normalizer (via Stondubleyt)
Modified:
nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
Modified:
nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1037742&r1=1037741&r2=1037742&view=diff
==============================================================================
---
nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
(original)
+++
nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
Mon Nov 22 14:56:40 2010
@@ -43,6 +43,7 @@ public class BasicURLNormalizer implemen
};
private Rule relativePathRule = null;
private Rule leadingRelativePathRule = null;
+ private Rule currentPathRule = null;
private Rule adjacentSlashRule = null;
private Configuration conf;
@@ -65,6 +66,13 @@ public class BasicURLNormalizer implemen
compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
leadingRelativePathRule.substitution = new Perl5Substitution("/");
+ // this pattern matches "/./" (current-directory) segments in the url,
+ // each of which can be replaced by a single "/"
+ currentPathRule = new Rule();
+ currentPathRule.pattern = (Perl5Pattern)
+ compiler.compile("(/\\./)", Perl5Compiler.READ_ONLY_MASK);
+ currentPathRule.substitution = new Perl5Substitution("/");
+
// this pattern tries to find spots like "xx//yy" in the url,
// which could be replaced by a "/"
adjacentSlashRule = new Rule();
@@ -171,6 +179,11 @@ public class BasicURLNormalizer implemen
fileWorkCopy = Util.substitute
(matcher, leadingRelativePathRule.pattern,
leadingRelativePathRule.substitution, fileWorkCopy, 1);
+
+ // remove unnecessary "/./"
+ fileWorkCopy = Util.substitute
+ (matcher, currentPathRule.pattern,
+ currentPathRule.substitution, fileWorkCopy, 1);
// collapse adjacent slashes with "/"
Modified:
nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=1037742&r1=1037741&r2=1037742&view=diff
==============================================================================
---
nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
(original)
+++
nutch/branches/branch-1.3/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
Mon Nov 22 14:56:40 2010
@@ -60,6 +60,9 @@ public class TestBasicURLNormalizer exte
// normalizeTest("http://foo.com/%66oo.html",
"http://foo.com/foo.html");
// check that unnecessary "../" are removed
+
+ normalizeTest("http://foo.com/aa/./foo.html",
+ "http://foo.com/aa/foo.html" );
normalizeTest("http://foo.com/aa/../",
"http://foo.com/" );
normalizeTest("http://foo.com/aa/bb/../",
@@ -112,4 +115,5 @@ public class TestBasicURLNormalizer exte
-}
+
+}
\ No newline at end of file