Author: jmssiera
Date: Mon Sep 26 09:11:04 2022
New Revision: 1904264

URL: http://svn.apache.org/viewvc?rev=1904264&view=rev
Log:
CONNECTORS-1733: TikaServiceRmeta does not properly handle unknown Tika exceptions

Modified:
    manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java

Modified: manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java?rev=1904264&r1=1904263&r2=1904264&view=diff
==============================================================================
--- 
manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java
 (original)
+++ 
manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java
 Mon Sep 26 09:11:04 2022
@@ -695,6 +695,7 @@ public class TikaExtractor extends org.a
 
     try {
       final Map<String, List<String>> metadata = new HashMap<>();
+      final Map<String, List<String>> embeddedResourcesMetadata = new 
HashMap<>();
       if (document.getFileName() != null) {
         metadata.put(TikaMetadataKeys.RESOURCE_NAME_KEY, new ArrayList<>());
         metadata.put("stream_name", new ArrayList<>());
@@ -792,7 +793,6 @@ public class TikaExtractor extends org.a
 
                   if (token != null) {
                     while ((token = jParser.nextToken()) != null && token != 
JsonToken.END_OBJECT) {
-
                       final int fieldNameLength = jParser.getTextLength();
                       if (fieldNameLength <= maxMetadataNameLength) {
                         final String fieldName = jParser.getCurrentName();
@@ -848,18 +848,17 @@ public class TikaExtractor extends org.a
                               totalMetadataLength -= fieldName.length();
                               metadata.remove(fieldName);
                             }
-                          } else if 
(fieldName.startsWith("X-TIKA:EXCEPTION:")) {
+                          } else if 
(fieldName.startsWith("X-TIKA:EXCEPTION:")) { // deal with Tika exceptions
                             boolean unknownException = false;
                             if 
(fieldName.contentEquals("X-TIKA:EXCEPTION:write_limit_reached")) {
                               resultCode = "TRUNCATEDOK";
                               truncated = true;
                             } else if 
(fieldName.contentEquals("X-TIKA:EXCEPTION:embedded_resource_limit_reached")) {
                               resources_limit = true;
-                            } else {
+                            } else if 
(!fieldName.contentEquals("X-TIKA:EXCEPTION:warn")) { // If the exception is 
other than a warning message
                               unknownException = true;
                               resultCode = "TIKAEXCEPTION";
-                              jParser.nextToken();
-                              description += fieldName + ": " + 
jParser.getText() + System.lineSeparator();
+                              description += getTikaExceptionDesc(jParser) + 
System.lineSeparator();
                             }
                             if (!unknownException) {
                               skipMetadata(jParser);
@@ -1047,6 +1046,21 @@ public class TikaExtractor extends org.a
     }
   }
 
+  private String getTikaExceptionDesc(final JsonParser jParser) throws 
IOException {
+    final StringBuilder exceptionDescBuilder = new StringBuilder();
+    JsonToken token = jParser.nextToken();
+    if (token == JsonToken.START_ARRAY) {
+      token = jParser.nextToken();
+      while (token != JsonToken.END_ARRAY) {
+        exceptionDescBuilder.append(jParser.getText());
+        token = jParser.nextToken();
+      }
+    } else {
+      exceptionDescBuilder.append(jParser.getText());
+    }
+    return exceptionDescBuilder.toString();
+  }
+
   private void removeField(final RepositoryDocument document, final String 
fieldName) {
     final Iterator<String> fields = document.getFields();
     while (fields.hasNext()) {


Reply via email to