This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 2239f8f66f TIKA-4563 -- various fixes based on regression testing
(#2697)
2239f8f66f is described below
commit 2239f8f66fab5c31586ffd67df528659bf181659
Author: Tim Allison <[email protected]>
AuthorDate: Mon Mar 16 17:23:26 2026 -0400
TIKA-4563 -- various fixes based on regression testing (#2697)
---
.../org/apache/tika/mime/tika-mimetypes.xml | 6 ++-
.../org/apache/tika/eval/app/ExtractComparer.java | 57 +++++++++++++++++++---
.../src/main/resources/comparison-reports-tags.xml | 25 ++++++++++
.../src/main/resources/comparison-reports.xml | 26 ++++++++++
.../parser/microsoft/AbstractPOIFSExtractor.java | 2 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 23 ++++++---
.../org/apache/tika/parser/pkg/PackageParser.java | 17 ++++++-
7 files changed, 137 insertions(+), 19 deletions(-)
diff --git
a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index d531459614..e5863a1998 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -4291,7 +4291,7 @@
<mime-type type="application/x-iso9660-image">
<acronym>ISO</acronym>
<_comment>ISO 9660 CD-ROM filesystem data</_comment>
- <magic priority="50">
+ <magic priority="60">
<match value="CD001" type="string" offset="32769"/>
<match value="CD001" type="string" offset="34817"/>
<match value="CD001" type="string" offset="36865"/>
@@ -5952,7 +5952,9 @@
<match value="0xffe3" type="string" offset="0"/> <!-- MP3 2.5 from
pronom -->
<!-- TIKA-417: This is the UTF-16 LE byte order mark! -->
<!-- match value="0xfffe" type="string" offset="0"/ --> <!-- V1, L1, CRC
-->
- <match value="0xffff" type="string" offset="0"/> <!-- V1, L1 -->
+ <!-- 0xffff has layer bits 00 which is "reserved" in the MPEG spec, not
L1.
+ Removed: caused false positives with binary files. -->
+ <!-- match value="0xffff" type="string" offset="0"/ -->
<!-- TIKA-4582: Require MP3 frame sync after ID3 tag to avoid false
positives with other ID3-tagged formats -->
<match value="ID3" type="string" offset="0">
<match type="regex" value="\\xFF[\\xE3\\xF2-\\xF7\\xFA-\\xFD\\xFF]"
offset="512:8192" />
diff --git
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
index cc96243836..4e150ba865 100644
---
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
+++
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
@@ -18,10 +18,13 @@ package org.apache.tika.eval.app;
import java.io.IOException;
import java.nio.file.Path;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
import java.sql.Types;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
@@ -222,6 +225,7 @@ public class ExtractComparer extends AbstractProfiler {
List<Integer> numAttachmentsB = countAttachments(metadataListB);
String sharedDigestKey = findSharedDigestKey(metadataListA,
metadataListB);
+ String emptyDigest = computeEmptyDigest(sharedDigestKey);
Map<Class, Object> tokenStatsA = null;
Map<Class, Object> tokenStatsB = null;
//now get that metadata
@@ -239,7 +243,7 @@ public class ExtractComparer extends AbstractProfiler {
writeProfileData(fpsA, i, contentTagsA, metadataA, fileId,
containerID, numAttachmentsA, PROFILES_A);
writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
- int matchIndex = getMatch(i, sharedDigestKey, handledB,
metadataListA, metadataListB);
+ int matchIndex = getMatch(i, sharedDigestKey, emptyDigest,
handledB, metadataListA, metadataListB);
if (matchIndex > -1 && !handledB.contains(matchIndex)) {
metadataB = metadataListB.get(matchIndex);
@@ -392,7 +396,7 @@ public class ExtractComparer extends AbstractProfiler {
* @param metadataListB
* @return
*/
- private int getMatch(int aIndex, String sharedDigestKey, Set<Integer>
handledB, List<Metadata> metadataListA, List<Metadata> metadataListB) {
+ private int getMatch(int aIndex, String sharedDigestKey, String
emptyDigest, Set<Integer> handledB, List<Metadata> metadataListA,
List<Metadata> metadataListB) {
//TODO: could make this more robust
if (metadataListB == null || metadataListB.size() == 0) {
return -1;
@@ -402,16 +406,23 @@ public class ExtractComparer extends AbstractProfiler {
return 0;
}
+ Metadata thisMetadata = metadataListA.get(aIndex);
+
if (sharedDigestKey != null) {
//first try to find matching digests
- return findMatchingDigests(sharedDigestKey, handledB,
metadataListA.get(aIndex), metadataListB);
+ int digestMatch = findMatchingDigests(sharedDigestKey,
emptyDigest, handledB, thisMetadata, metadataListB);
+ if (digestMatch > -1) {
+ return digestMatch;
+ }
}
- //assume same embedded resource path. Not always true!
- Metadata thisMetadata = metadataListA.get(aIndex);
+ //try matching by embedded resource path
String embeddedPath =
thisMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
if (embeddedPath != null) {
for (int j = 0; j < metadataListB.size(); j++) {
+ if (handledB.contains(j)) {
+ continue;
+ }
String thatEmbeddedPath = metadataListB
.get(j)
.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
@@ -429,11 +440,16 @@ public class ExtractComparer extends AbstractProfiler {
return -1;
}
- private int findMatchingDigests(String sharedDigestKey, Set<Integer>
handledB, Metadata metadata, List<Metadata> metadataListB) {
+ private int findMatchingDigests(String sharedDigestKey, String
emptyDigest, Set<Integer> handledB, Metadata metadata, List<Metadata>
metadataListB) {
String digestA = metadata.get(sharedDigestKey);
if (digestA == null) {
return -1;
}
+ // Skip matching on the empty-content digest -- it's meaningless
+ // and causes false matches among unrelated zero-byte embedded docs
+ if (digestA.equalsIgnoreCase(emptyDigest)) {
+ return -1;
+ }
String resourceName =
metadata.get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH);
int cand = -1;
@@ -453,6 +469,35 @@ public class ExtractComparer extends AbstractProfiler {
return cand;
}
+ /**
+ * Computes the hex-encoded digest of empty (zero-byte) content for the
+ * algorithm identified by the shared digest key (e.g. "X-TIKA:digest:MD5").
+ * The algorithm name is taken to be everything after the first
+ * DIGEST_KEY_PREFIX.length() characters of the key. A name consisting of
+ * "SHA" (any case) followed only by digits is upper-cased and rewritten
+ * to "SHA-" plus the digits before it is passed to
+ * {@link MessageDigest#getInstance(String)}; any other name is passed
+ * through unchanged.
+ * The digest is rendered as lower-case hex, two characters per byte.
+ * NOTE(review): the key is not checked to actually start with
+ * DIGEST_KEY_PREFIX here -- presumably the caller guarantees that; confirm.
+ *
+ * @param sharedDigestKey metadata key under which the digest is stored;
+ * may be null
+ * @return the hex digest of zero bytes, or null if the key is null or
+ * the algorithm cannot be resolved by {@link MessageDigest}
+ */
+ private static String computeEmptyDigest(String sharedDigestKey) {
+ if (sharedDigestKey == null) {
+ return null;
+ }
+ // key format: "X-TIKA:digest:MD5" or "X-TIKA:digest:SHA256" etc.
+ String algo = sharedDigestKey.substring(DIGEST_KEY_PREFIX.length());
+ // normalize common names to MessageDigest algorithm names
+ // e.g. SHA256 -> SHA-256
+ if (algo.matches("(?i)SHA(\\d+)")) {
+ algo = algo.toUpperCase(Locale.ROOT).replaceFirst("SHA(\\d+)",
"SHA-$1");
+ }
+ try {
+ MessageDigest md = MessageDigest.getInstance(algo);
+ byte[] emptyHash = md.digest(new byte[0]);
+ StringBuilder sb = new StringBuilder();
+ for (byte b : emptyHash) {
+ sb.append(String.format(Locale.ROOT, "%02x", b));
+ }
+ return sb.toString();
+ } catch (NoSuchAlgorithmException e) {
+ return null;
+ }
+ }
+
private void writeContrasts(Map<Cols, String> data, ContrastStatistics
contrastStatistics) {
writeContrastString(data, Cols.TOP_10_MORE_IN_A,
contrastStatistics.getTopNMoreA());
writeContrastString(data, Cols.TOP_10_MORE_IN_B,
contrastStatistics.getTopNMoreB());
diff --git
a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml
b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml
index 6acb475901..90f6a7fad7 100644
--- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml
@@ -1621,6 +1621,31 @@
order by cnt desc
</sql>
</report>
+ <report reportName="Attachment Name Changes"
+ reportFilename="attachments/attachment_name_diffs.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select file_path,
+ pa.embedded_file_path as EMBEDDED_NAME_A,
+ pb.embedded_file_path as EMBEDDED_NAME_B,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B
+ from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ where pa.is_embedded=true
+ and pb.is_embedded=true
+ and pa.embedded_file_path is not null
+ and pb.embedded_file_path is not null
+ and pa.embedded_file_path <> pb.embedded_file_path
+ order by file_path, pa.embedded_file_path
+ limit 100000
+ </sql>
+ </report>
<!-- metadata values -->
<report reportName="Metadata Value Diffs"
reportFilename="metadata/metadata_value_count_diffs.xlsx"
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
index 667578daa9..082949e9cf 100644
--- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
@@ -1489,6 +1489,32 @@
limit 20000;
</sql>
</report>
+ <!-- attachment name changes -->
+ <report reportName="Attachment Name Changes"
+ reportFilename="attachments/attachment_name_diffs.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select file_path,
+ pa.embedded_file_path as EMBEDDED_NAME_A,
+ pb.embedded_file_path as EMBEDDED_NAME_B,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B
+ from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ where pa.is_embedded=true
+ and pb.is_embedded=true
+ and pa.embedded_file_path is not null
+ and pb.embedded_file_path is not null
+ and pa.embedded_file_path <> pb.embedded_file_path
+ order by file_path, pa.embedded_file_path
+ limit 100000
+ </sql>
+ </report>
<after>
<sql>drop table if exists md5_multiples_tmp_a</sql>
<sql>drop table if exists md5_multiples_tmp_b</sql>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index d4294f4862..5bc93a3a96 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -323,7 +323,7 @@ abstract class AbstractPOIFSExtractor {
}
// Record what we can do about it
- metadata.set(Metadata.CONTENT_TYPE, mediaType.getType());
+ metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName +
extension);
metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
parseEmbedded(parentDir, tis, xhtml, metadata, outputHtml);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 8b41630f32..fe63fe156d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -304,8 +304,10 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
metadata.set(Office.HAS_EXTERNAL_PIVOT_DATA, true);
}
}
- } catch (IOException | TikaException | SAXException e) {
- // swallow
+ } catch (IOException | TikaException | SAXException |
IllegalArgumentException e) {
+ // swallow -- POI throws IllegalArgumentException when a
+ // relationship references a part missing from the package
+ // (e.g. truncated files)
}
}
}
@@ -364,8 +366,9 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
metadata.set(Office.HAS_DDE_LINKS, true);
}
}
- } catch (IOException | TikaException e) {
- // swallow
+ } catch (IOException | TikaException |
IllegalArgumentException e) {
+ // swallow -- POI can throw IllegalArgumentException
+ // for malformed relationships
}
}
}
@@ -395,8 +398,10 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
metadata.set(Office.HAS_WEB_QUERIES, true);
}
}
- } catch (IOException | TikaException e) {
- // swallow
+ } catch (IOException | TikaException | IllegalArgumentException e)
{
+ // swallow -- POI throws IllegalArgumentException when a
+ // relationship references a part missing from the package
+ // (e.g. truncated files)
}
}
}
@@ -418,8 +423,10 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
XMLReaderUtils.parseSAX(is, new
QueryTableHandler(xhtml), parseContext);
}
}
- } catch (IOException | TikaException e) {
- // swallow
+ } catch (IOException | TikaException | IllegalArgumentException e)
{
+ // swallow -- POI throws IllegalArgumentException when a
+ // relationship references a part missing from the package
+ // (e.g. truncated files)
}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 44a6752ea9..25b4fa1ccc 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -414,9 +414,20 @@ public class PackageParser extends
AbstractEncodingDetectorParser {
// Try to detect charset of archive entry in case of non-unicode
filename is used
if (detectCharsetsInEntryNames) {
byte[] entryName = entry.getRawName();
- if (entryName != null && entryName.length >=
MIN_BYTES_FOR_DETECTING_CHARSET) {
+ if (entryName != null && entryName.length > 0) {
+ // Extend short entry name to improve accuracy of charset
detection
+ byte[] extendedEntryName = entryName;
+ if (entryName.length < MIN_BYTES_FOR_DETECTING_CHARSET) {
+ int len = entryName.length *
(MIN_BYTES_FOR_DETECTING_CHARSET / entryName.length);
+ extendedEntryName = new byte[len];
+ for (int i = 0; i < len; i++) {
+ extendedEntryName[i] = entryName[i % entryName.length];
+ }
+ }
Charset charset = getEncodingDetector().detect(
- new UnsynchronizedByteArrayInputStream(entryName), new
Metadata());
+ UnsynchronizedByteArrayInputStream.builder()
+ .setByteArray(extendedEntryName).get(),
+ parentMetadata);
if (charset != null) {
name = new String(entryName, charset);
}
@@ -431,6 +442,8 @@ public class PackageParser extends
AbstractEncodingDetectorParser {
if (extractor.shouldParseEmbedded(entryMetadata)) {
try (InputStream entryStream = zipFile.getInputStream(entry)) {
extractor.parseEmbedded(entryStream, xhtml, entryMetadata,
true);
+ } catch (UnsupportedZipFeatureException e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
parentMetadata);
}
}
}