Author: jmssiera
Date: Mon Sep 26 15:47:05 2022
New Revision: 1904280

URL: http://svn.apache.org/viewvc?rev=1904280&view=rev
Log:
CONNECTORS-1735: TikaServiceRmeta does not properly handle embedded resources

Modified:
    manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java

Modified: manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java?rev=1904280&r1=1904279&r2=1904280&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java (original)
+++ manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java Mon Sep 26 15:47:05 2022
@@ -695,7 +695,7 @@ public class TikaExtractor extends org.a
 
     try {
       final Map<String, List<String>> metadata = new HashMap<>();
-      final Map<String, List<String>> embeddedResourcesMetadata = new HashMap<>();
+      final List<String> embeddedResourcesNames = new ArrayList<>();
       if (document.getFileName() != null) {
         metadata.put(TikaMetadataKeys.RESOURCE_NAME_KEY, new ArrayList<>());
         metadata.put("stream_name", new ArrayList<>());
@@ -878,9 +878,48 @@ public class TikaExtractor extends org.a
                         skipMetadata(jParser);
                       }
                     }
+
+                    // If token not null then there are embedded resources, process them if the extractArchives option is enabled
+                    if (token != null && token == JsonToken.END_OBJECT && sp.extractArchives) {
+                      // For embedded resource we only gather resourceNames and resources content, skip the rest
+                      while ((token = jParser.nextToken()) != null) {
+                        final String fieldName = jParser.getCurrentName();
+                        if (fieldName != null && fieldName.contentEquals("resourceName")) {
+                          token = jParser.nextToken();
+                          if (jParser.getTextLength() <= sp.maxMetadataValueLength) {
+                            embeddedResourcesNames.add(jParser.getText());
+                          } else {
+                            metadataSkipped = true;
+                          }
+                        } else if (fieldName != null && fieldName.contentEquals("X-TIKA:content")) {
+                          // Add embedded resource content to main document content
+                          jParser.nextToken();
+                          length += jParser.getText(w);
+                        }
+                      }
+                    }
+
                     jParser.close();
                   }
 
+                  // If the are embedded resources, add their names, if possible, to the metadata
+                  for (final String embeddedResourceName : embeddedResourcesNames) {
+                    final int resourceNameBytesLength = embeddedResourceName.getBytes().length;
+
+                    final int totalMetadataLengthPreview = totalMetadataLength + resourceNameBytesLength;
+                    if (totalMetadataLengthPreview <= sp.totalMetadataLimit) {
+                      if (!metadata.containsKey("embeddedResourcesNames")) {
+                        totalMetadataLength += "embeddedResourcesNames".getBytes().length;
+                        metadata.put("embeddedResourcesNames", new ArrayList<>());
+                      }
+                      metadata.get("embeddedResourcesNames").add(embeddedResourceName);
+                      totalMetadataLength += resourceNameBytesLength;
+                    } else {
+                      maxMetadataReached = true;
+                    }
+
+                  }
+
                   if (maxMetadataReached) {
                    description += "Some metadata have been skipped because the total metadata limit of " + sp.totalMetadataLimit + " has been reached" + System.lineSeparator();
                   } else if (metadataSkipped) {


Reply via email to