Author: kwright
Date: Fri Oct 16 10:25:18 2020
New Revision: 1882582

URL: http://svn.apache.org/viewvc?rev=1882582&view=rev
Log:
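Fix for CONNECTORS-1655: when extracting the charset from a Content-Type
header, ignore any additional parameters that follow the charset value.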

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1882582&r1=1882581&r2=1882582&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Oct 16 10:25:18 2020
@@ -3,6 +3,8 @@ $Id$
 
 ======================= 2.18-dev =====================
 
+CONNECTORS-1655: Handle some forms of illegal content type.
+(Karl Wright)
 
 ======================= Release 2.17 =====================
 

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1882582&r1=1882581&r2=1882582&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Fri Oct 16 10:25:18 2020
@@ -1497,8 +1497,15 @@ public class WebcrawlerConnector extends
       return null;
     String suffix = contentType.substring(semiIndex+1);
     suffix = suffix.trim();
-    if (suffix.startsWith("charset="))
-      return suffix.substring("charset=".length());
+    if (suffix.startsWith("charset=")) {
+      String trialSuffix = suffix.substring("charset=".length());
+      int semi = trialSuffix.indexOf(";");
+      if (semi == -1) {
+        return trialSuffix;
+      }
+      // Strip off additional crap some websites now add
+      return trialSuffix.substring(0, semi).trim();
+    }
     return null;
   }
   


Reply via email to