Author: kwright
Date: Wed Sep  5 16:42:37 2018
New Revision: 1840145

URL: http://svn.apache.org/viewvc?rev=1840145&view=rev
Log:
CONNECTORS-1528: Strip out duplicate slashes from path

Modified:
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1840145&r1=1840144&r2=1840145&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Wed Sep  5 16:42:37 2018
@@ -3670,6 +3670,12 @@ public class WebcrawlerConnector extends
 
     }
 
+    // Remove duplicate path slashes.  This is gated by the "lowercase" selection, since it's also an IIS-specific problem.
+    if (p != null && p.canLowercase())
+    {
+      pathString = filterMultipleSlashes(pathString);
+    }
+    
     // Put it back into the URL without the ref, and with the modified query and path parts.
     url = new WebURL(url.getScheme(),url.getHost(),url.getPort(),pathString,queryString);
     String rval = url.toASCIIString();
@@ -3681,6 +3687,19 @@ public class WebcrawlerConnector extends
     return rval;
   }
 
+  private static String filterMultipleSlashes(String pathString) {
+    // Not terribly efficient unless there are almost never duplicate slashes
+    while (true)
+    {
+      final int index = pathString.indexOf("//");
+      if (index == -1)
+      {
+        return pathString;
+      }
+      pathString = pathString.substring(0, index) + pathString.substring(index + 1);
+    }
+  }
+  
   /** Code to check if data is interesting, based on response code and content type.
   */
   protected boolean isContentInteresting(IFingerprintActivity activities, String documentIdentifier, int response, String contentType)


Reply via email to