This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new b98268a845 TIKA-4612 -- improve mp3 and aac detection
b98268a845 is described below
commit b98268a8453aa5b06946800e77008b96a0bc6d7f
Author: tallison <[email protected]>
AuthorDate: Fri Jan 9 10:04:56 2026 -0500
TIKA-4612 -- improve mp3 and aac detection
---
.../org/apache/tika/mime/tika-mimetypes.xml | 20 ++++++++++++++------
.../java/org/apache/tika/mime/TestMimeTypes.java | 9 +++++++++
.../test-documents/testMP3_id3_false_aac.mp3 | Bin 0 -> 1024 bytes
3 files changed, 23 insertions(+), 6 deletions(-)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 5c5523f47c..27ccbdb033 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5928,7 +5928,7 @@
<alias type="audio/x-mpeg"/>
<acronym>MP3</acronym>
<_comment>MPEG-1 Audio Layer 3</_comment>
- <magic priority="20">
+ <magic priority="50">
<!-- http://mpgedit.org/mpgedit/mpeg_format/MP3Format.html -->
<!-- Bit pattern for first two bytes: 11111111 111VVLLC -->
<!-- VV = MPEG Audio Version ID; 10 = V2, 11 = V1 -->
@@ -5948,11 +5948,18 @@
<!-- TIKA-417: This is the UTF-16 LE byte order mark! -->
<!-- match value="0xfffe" type="string" offset="0"/ --> <!-- V1, L1, CRC -->
<match value="0xffff" type="string" offset="0"/> <!-- V1, L1 -->
- <match value="ID3" type="string" offset="0"/>
+ <!-- TIKA-4582: Require MP3 frame sync after ID3 tag to avoid false positives with other ID3-tagged formats -->
+ <match value="ID3" type="string" offset="0">
+ <match type="regex" value="\\xFF[\\xE3\\xF2-\\xF7\\xFA-\\xFD\\xFF]" offset="512:8192" />
+ </match>
<!-- in the wild, 0D0A or quite a few \x00 may precede the magic -->
<match value="(?:\\x0D\\x0A|\\x00{1,1024})(?:\\xff[\\xe3\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd\\xfe\\xff]|ID3)" type="regex" offset="0"/>
</magic>
+ <!-- TIKA-4582: Low-priority fallback for ID3-only files (truncated or very large ID3 tags) -->
+ <magic priority="10">
+ <match value="ID3" type="string" offset="0"/>
+ </magic>
<glob pattern="*.mpga"/>
<glob pattern="*.mp2"/>
<glob pattern="*.mp2a"/>
@@ -6159,13 +6166,14 @@
<alias type="audio/aac"/>
<glob pattern="*.aac"/>
<magic priority="30">
- <!-- Without ID3 tags -->
- <match type="regex"
value="\\xFF(\\xF0|\\xF1|\\xF8|\\xF9)(\\x40|\\x41|\\x44|\\x45|\\x48|\\x49|\\x4C|\\x4D|\\x50|\\x51|\\x54|\\x55|\\x58|\\x59|\\x5C|\\x5D|\\x60|\\x61|\\x64|\\x65|\\x68|\\x69|\\x6C|\\x6D|\\x70|\\x71|\\x80|\\x81|\\x84|\\x85|\\x88|\\x89|\\x8C|\\x8D|\\x90|\\x91|\\x94|\\x95|\\x98|\\x99|\\x9C|\\x9D|\\xA0|\\xA1|\\xA4|\\xA5|\\xA8|\\xA9|\\xAC|\\xAD|\\xB0|\\xB1)(\\x00|\\x01|\\x20|\\x40|\\x41|\\x60|\\x80|\\x81|\\x60|\\xA0|\\xC0|\\xC1|\\xE0)"
offset="0" />
+ <!-- Without ID3 tags - require two consecutive ADTS frame syncs -->
+ <!-- ADTS frames are typically 100-2000 bytes, so look for second sync at that distance -->
+ <match type="regex" value="(?s)\\xFF[\\xF0\\xF1\\xF8\\xF9].{2}.{100,2000}\\xFF[\\xF0\\xF1\\xF8\\xF9]" offset="0" />
</magic>
<magic priority="40">
- <!-- With ID3 tags at the start -->
+ <!-- With ID3 tags at the start - require two consecutive ADTS frame syncs -->
<match value="ID3" type="string" offset="0">
- <match type="regex"
value="\\xFF(\\xF0|\\xF1|\\xF8|\\xF9)(\\x40|\\x41|\\x44|\\x45|\\x48|\\x49|\\x4C|\\x4D|\\x50|\\x51|\\x54|\\x55|\\x58|\\x59|\\x5C|\\x5D|\\x60|\\x61|\\x64|\\x65|\\x68|\\x69|\\x6C|\\x6D|\\x70|\\x71|\\x80|\\x81|\\x84|\\x85|\\x88|\\x89|\\x8C|\\x8D|\\x90|\\x91|\\x94|\\x95|\\x98|\\x99|\\x9C|\\x9D|\\xA0|\\xA1|\\xA4|\\xA5|\\xA8|\\xA9|\\xAC|\\xAD|\\xB0|\\xB1)(\\x00|\\x01|\\x20|\\x40|\\x41|\\x60|\\x80|\\x81|\\x60|\\xA0|\\xC0|\\xC1|\\xE0)"
offset="512:2048" />
+ <match type="regex" value="(?s)\\xFF[\\xF0\\xF1\\xF8\\xF9].{2}.{100,2000}\\xFF[\\xF0\\xF1\\xF8\\xF9]" offset="512:8192" />
</match>
</magic>
</mime-type>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index f72f7abb7f..fef82e945e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -1386,6 +1386,15 @@ public class TestMimeTypes {
assertTypeByName("audio/x-aac", "x.aac");
}
+ /**
+ * TIKA-4582: MP3 files with ID3 tags should not be misdetected as AAC.
+ * This test file contains random audio data that happens to match the AAC ADTS sync pattern.
+ */
+ @Test
+ public void testMP3Detection() throws Exception {
+ assertTypeByData("audio/mpeg", "testMP3_id3_false_aac.mp3");
+ }
+
private void assertText(byte[] prefix) throws IOException {
assertMagic("text/plain", prefix);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testMP3_id3_false_aac.mp3 b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testMP3_id3_false_aac.mp3
new file mode 100644
index 0000000000..c8366dc2c7
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testMP3_id3_false_aac.mp3 differ