Author: kwright
Date: Fri Oct 16 10:25:18 2020
New Revision: 1882582
URL: http://svn.apache.org/viewvc?rev=1882582&view=rev
Log:
Fix for CONNECTORS-1655.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1882582&r1=1882581&r2=1882582&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Oct 16 10:25:18 2020
@@ -3,6 +3,8 @@ $Id$
======================= 2.18-dev =====================
+CONNECTORS-1655: Handle some forms of illegal content type.
+(Karl Wright)
======================= Release 2.17 =====================
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1882582&r1=1882581&r2=1882582&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Fri Oct 16 10:25:18 2020
@@ -1497,8 +1497,15 @@ public class WebcrawlerConnector extends
return null;
String suffix = contentType.substring(semiIndex+1);
suffix = suffix.trim();
- if (suffix.startsWith("charset="))
- return suffix.substring("charset=".length());
+ if (suffix.startsWith("charset=")) {
+ String trialSuffix = suffix.substring("charset=".length());
+ int semi = trialSuffix.indexOf(";");
+ if (semi == -1) {
+ return trialSuffix;
+ }
+ // Strip off additional crap some websites now add
+ return trialSuffix.substring(0, semi).trim();
+ }
return null;
}