Author: markus
Date: Wed Jul  6 15:35:51 2011
New Revision: 1143468

URL: http://svn.apache.org/viewvc?rev=1143468&view=rev
Log:
NUTCH-1011 Remove duplicate slashes from URLs

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/regex-normalize.xml.template
    nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1143468&r1=1143467&r2=1143468&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul  6 15:35:51 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.0 - Current Development
 
+* NUTCH-1011 Normalize duplicate slashes in URL's (markus)
+
 * NUTCH-1013 Migrate RegexURLNormalizer from Apache ORO to java.util.regex (markus)
 
 * NUTCH-1016 Strip UTF-8 non-character codepoints and add logging for SolrWriter (markus)

Modified: nutch/trunk/conf/regex-normalize.xml.template
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/regex-normalize.xml.template?rev=1143468&r1=1143467&r2=1143468&view=diff
==============================================================================
--- nutch/trunk/conf/regex-normalize.xml.template (original)
+++ nutch/trunk/conf/regex-normalize.xml.template Wed Jul  6 15:35:51 2011
@@ -63,4 +63,10 @@
   <substitution></substitution>
 </regex>
 
+<!-- removes duplicate slashes -->
+<regex>
+  <pattern>(?&lt;!:)/{2,}</pattern>
+  <substitution>/</substitution>
+</regex>
+
 </regex-normalize>

Modified: nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java?rev=1143468&r1=1143467&r2=1143468&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java Wed Jul  6 15:35:51 2011
@@ -39,6 +39,15 @@ public class TestURLNormalizers extends 
     } catch (MalformedURLException mue) {
       fail(mue.toString());
     }
+
+    // NUTCH-1011 - Get rid of superfluous slashes
+    try {
+      String normalizedSlashes = normalizers.normalize("http://www.example.org//path/to//somewhere.html", URLNormalizers.SCOPE_DEFAULT);
+      assertEquals(normalizedSlashes, "http://www.example.org/path/to/somewhere.html");
+    } catch (MalformedURLException mue) {
+      fail(mue.toString());
+    }
+
     // check the order
     int pos1 = -1, pos2 = -1;
     URLNormalizer[] impls = normalizers.getURLNormalizers(URLNormalizers.SCOPE_DEFAULT);


Reply via email to