Author: kwright
Date: Fri Aug 23 11:44:43 2019
New Revision: 1865744
URL: http://svn.apache.org/viewvc?rev=1865744&view=rev
Log:
Fix for CONNECTORS-1621.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1865744&r1=1865743&r2=1865744&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Aug 23 11:44:43 2019
@@ -3,6 +3,9 @@ $Id$
======================= 2.14-dev =====================
+CONNECTORS-1621: Fix broken ability to index Tika-extracted documents in Solr.
+(Markus Schuch, Karl Wright)
+
CONNECTORS-1620: Add ability to extract links from application/xml documents.
(Markus Schuch)
Modified:
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java?rev=1865744&r1=1865743&r2=1865744&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
(original)
+++
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
Fri Aug 23 11:44:43 2019
@@ -578,7 +578,9 @@ public class HttpPoster
}
// If not the right mime type, reject it.
- if ((includedMimeTypes !=null || excludedMimeTypes != null) &&
!checkMimeTypeIndexable(document.getMimeType(), useExtractUpdateHandler,
includedMimeTypes, excludedMimeTypes)) {
+ // Note: this code added as part of CONNECTORS-1482 was incorrect!
Document filtering specified in the solr connector is always against the
+ // ORIGINAL mime type (which is what's in the document). This is why the
checkMimeTypeIndexable second argument is always "true".
+ if ((includedMimeTypes !=null || excludedMimeTypes != null) &&
!checkMimeTypeIndexable(document.getMimeType(), true, includedMimeTypes,
excludedMimeTypes)) {
activities.recordActivity(null,SolrConnector.INGEST_ACTIVITY,null,documentURI,activities.EXCLUDED_MIMETYPE,"Solr
connector rejected document due to mime type restrictions:
("+document.getMimeType()+")");
return false;
}
@@ -812,9 +814,17 @@ public class HttpPoster
final String lowerMimeType =
(mimeType==null)?null:mimeType.toLowerCase(Locale.ROOT);
if (useExtractUpdateHandler)
{
- if (includedMimeTypes != null &&
!includedMimeTypes.contains(lowerMimeType))
+ // Strip the charset off for this check
+ int index = lowerMimeType == null ? -1 : lowerMimeType.indexOf(";");
+ final String checkMimeType;
+ if (index != -1) {
+ checkMimeType = lowerMimeType.substring(0,index);
+ } else {
+ checkMimeType = lowerMimeType;
+ }
+ if (includedMimeTypes != null &&
!includedMimeTypes.contains(checkMimeType))
return false;
- if (excludedMimeTypes != null &&
excludedMimeTypes.contains(lowerMimeType))
+ if (excludedMimeTypes != null &&
excludedMimeTypes.contains(checkMimeType))
return false;
return true;
}