Author: jmssiera
Date: Mon Sep 26 15:47:05 2022
New Revision: 1904280
URL: http://svn.apache.org/viewvc?rev=1904280&view=rev
Log:
CONNECTORS-1735: TikaServiceRmeta does not properly handle embedded resources
Modified:
manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java
Modified:
manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java?rev=1904280&r1=1904279&r2=1904280&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java
(original)
+++
manifoldcf/trunk/connectors/tikaservice-rmeta/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/rmeta/TikaExtractor.java
Mon Sep 26 15:47:05 2022
@@ -695,7 +695,7 @@ public class TikaExtractor extends org.a
try {
final Map<String, List<String>> metadata = new HashMap<>();
- final Map<String, List<String>> embeddedResourcesMetadata = new
HashMap<>();
+ final List<String> embeddedResourcesNames = new ArrayList<>();
if (document.getFileName() != null) {
metadata.put(TikaMetadataKeys.RESOURCE_NAME_KEY, new ArrayList<>());
metadata.put("stream_name", new ArrayList<>());
@@ -878,9 +878,48 @@ public class TikaExtractor extends org.a
skipMetadata(jParser);
}
}
+
+ // If token is not null then there are embedded resources,
process them if the extractArchives option is enabled
+ if (token != null && token == JsonToken.END_OBJECT &&
sp.extractArchives) {
+ // For embedded resources we only gather resourceNames
and resources content, skip the rest
+ while ((token = jParser.nextToken()) != null) {
+ final String fieldName = jParser.getCurrentName();
+ if (fieldName != null &&
fieldName.contentEquals("resourceName")) {
+ token = jParser.nextToken();
+ if (jParser.getTextLength() <=
sp.maxMetadataValueLength) {
+ embeddedResourcesNames.add(jParser.getText());
+ } else {
+ metadataSkipped = true;
+ }
+ } else if (fieldName != null &&
fieldName.contentEquals("X-TIKA:content")) {
+ // Add embedded resource content to main document
content
+ jParser.nextToken();
+ length += jParser.getText(w);
+ }
+ }
+ }
+
jParser.close();
}
+ // If there are embedded resources, add their names, if
possible, to the metadata
+ for (final String embeddedResourceName :
embeddedResourcesNames) {
+ final int resourceNameBytesLength =
embeddedResourceName.getBytes().length;
+
+ final int totalMetadataLengthPreview = totalMetadataLength
+ resourceNameBytesLength;
+ if (totalMetadataLengthPreview <= sp.totalMetadataLimit) {
+ if (!metadata.containsKey("embeddedResourcesNames")) {
+ totalMetadataLength +=
"embeddedResourcesNames".getBytes().length;
+ metadata.put("embeddedResourcesNames", new
ArrayList<>());
+ }
+
metadata.get("embeddedResourcesNames").add(embeddedResourceName);
+ totalMetadataLength += resourceNameBytesLength;
+ } else {
+ maxMetadataReached = true;
+ }
+
+ }
+
if (maxMetadataReached) {
description += "Some metadata have been skipped because
the total metadata limit of " + sp.totalMetadataLimit + " has been reached" +
System.lineSeparator();
} else if (metadataSkipped) {