Author: markus
Date: Wed Jul 6 15:35:51 2011
New Revision: 1143468
URL: http://svn.apache.org/viewvc?rev=1143468&view=rev
Log:
NUTCH-1011 Remove duplicate slashes from URLs
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/regex-normalize.xml.template
nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1143468&r1=1143467&r2=1143468&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul 6 15:35:51 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.0 - Current Development
+* NUTCH-1011 Normalize duplicate slashes in URL's (markus)
+
* NUTCH-1013 Migrate RegexURLNormalizer from Apache ORO to java.util.regex
(markus)
* NUTCH-1016 Strip UTF-8 non-character codepoints and add logging for
SolrWriter (markus)
Modified: nutch/trunk/conf/regex-normalize.xml.template
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/regex-normalize.xml.template?rev=1143468&r1=1143467&r2=1143468&view=diff
==============================================================================
--- nutch/trunk/conf/regex-normalize.xml.template (original)
+++ nutch/trunk/conf/regex-normalize.xml.template Wed Jul 6 15:35:51 2011
@@ -63,4 +63,10 @@
<substitution></substitution>
</regex>
+<!-- removes duplicate slashes -->
+<regex>
+ <pattern>(?&lt;!:)/{2,}</pattern>
+ <substitution>/</substitution>
+</regex>
+
</regex-normalize>
Modified: nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java?rev=1143468&r1=1143467&r2=1143468&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java Wed Jul 6 15:35:51 2011
@@ -39,6 +39,15 @@ public class TestURLNormalizers extends
} catch (MalformedURLException mue) {
fail(mue.toString());
}
+
+ // NUTCH-1011 - Get rid of superfluous slashes
+ try {
+ String normalizedSlashes =
normalizers.normalize("http://www.example.org//path/to//somewhere.html",
URLNormalizers.SCOPE_DEFAULT);
+ assertEquals(normalizedSlashes,
"http://www.example.org/path/to/somewhere.html");
+ } catch (MalformedURLException mue) {
+ fail(mue.toString());
+ }
+
// check the order
int pos1 = -1, pos2 = -1;
URLNormalizer[] impls =
normalizers.getURLNormalizers(URLNormalizers.SCOPE_DEFAULT);