Author: jmssiera
Date: Mon Sep 26 09:11:04 2022
New Revision: 1904264
URL: http://svn.apache.org/viewvc?rev=1904264&view=rev
Log:
CONNECTORS-1733: TikaServiceRmeta does not properly handle unknown Tika
exceptions
Modified:
manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java
Modified:
manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java?rev=1904264&r1=1904263&r2=1904264&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java
(original)
+++
manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java
Mon Sep 26 09:11:04 2022
@@ -695,6 +695,7 @@ public class TikaExtractor extends org.a
try {
final Map<String, List<String>> metadata = new HashMap<>();
+ final Map<String, List<String>> embeddedResourcesMetadata = new
HashMap<>();
if (document.getFileName() != null) {
metadata.put(TikaMetadataKeys.RESOURCE_NAME_KEY, new ArrayList<>());
metadata.put("stream_name", new ArrayList<>());
@@ -792,7 +793,6 @@ public class TikaExtractor extends org.a
if (token != null) {
while ((token = jParser.nextToken()) != null && token !=
JsonToken.END_OBJECT) {
-
final int fieldNameLength = jParser.getTextLength();
if (fieldNameLength <= maxMetadataNameLength) {
final String fieldName = jParser.getCurrentName();
@@ -848,18 +848,17 @@ public class TikaExtractor extends org.a
totalMetadataLength -= fieldName.length();
metadata.remove(fieldName);
}
- } else if
(fieldName.startsWith("X-TIKA:EXCEPTION:")) {
+ } else if
(fieldName.startsWith("X-TIKA:EXCEPTION:")) { // deal with Tika exceptions
boolean unknownException = false;
if
(fieldName.contentEquals("X-TIKA:EXCEPTION:write_limit_reached")) {
resultCode = "TRUNCATEDOK";
truncated = true;
} else if
(fieldName.contentEquals("X-TIKA:EXCEPTION:embedded_resource_limit_reached")) {
resources_limit = true;
- } else {
+ } else if
(!fieldName.contentEquals("X-TIKA:EXCEPTION:warn")) { // If the exception is
other than a warning message
unknownException = true;
resultCode = "TIKAEXCEPTION";
- jParser.nextToken();
- description += fieldName + ": " +
jParser.getText() + System.lineSeparator();
+ description += getTikaExceptionDesc(jParser) +
System.lineSeparator();
}
if (!unknownException) {
skipMetadata(jParser);
@@ -1047,6 +1046,21 @@ public class TikaExtractor extends org.a
}
}
+ private String getTikaExceptionDesc(final JsonParser jParser) throws
IOException {
+ final StringBuilder exceptionDescBuilder = new StringBuilder();
+ JsonToken token = jParser.nextToken();
+ if (token == JsonToken.START_ARRAY) {
+ token = jParser.nextToken();
+ while (token != JsonToken.END_ARRAY) {
+ exceptionDescBuilder.append(jParser.getText());
+ token = jParser.nextToken();
+ }
+ } else {
+ exceptionDescBuilder.append(jParser.getText());
+ }
+ return exceptionDescBuilder.toString();
+ }
+
private void removeField(final RepositoryDocument document, final String
fieldName) {
final Iterator<String> fields = document.getFields();
while (fields.hasNext()) {