This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4612 in repository https://gitbox.apache.org/repos/asf/tika.git
commit efa71ce8376f636ff24ee33b64b81670f1b886d0 Author: tallison <[email protected]> AuthorDate: Fri Jan 9 10:04:56 2026 -0500 TIKA-4612 -- improve mp3 and aac detection --- .../org/apache/tika/mime/tika-mimetypes.xml | 20 ++++++++++++++------ .../java/org/apache/tika/mime/TestMimeTypes.java | 9 +++++++++ .../test-documents/testMP3_id3_false_aac.mp3 | Bin 0 -> 1024 bytes 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 5c5523f47c..27ccbdb033 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -5928,7 +5928,7 @@ <alias type="audio/x-mpeg"/> <acronym>MP3</acronym> <_comment>MPEG-1 Audio Layer 3</_comment> - <magic priority="20"> + <magic priority="50"> <!-- http://mpgedit.org/mpgedit/mpeg_format/MP3Format.html --> <!-- Bit pattern for first two bytes: 11111111 111VVLLC --> <!-- VV = MPEG Audio Version ID; 10 = V2, 11 = V1 --> @@ -5948,11 +5948,18 @@ <!-- TIKA-417: This is the UTF-16 LE byte order mark! --> <!-- match value="0xfffe" type="string" offset="0"/ --> <!-- V1, L1, CRC --> <match value="0xffff" type="string" offset="0"/> <!-- V1, L1 --> - <match value="ID3" type="string" offset="0"/> + <!-- TIKA-4582: Require MP3 frame sync after ID3 tag to avoid false positives with other ID3-tagged formats --> + <match value="ID3" type="string" offset="0"> + <match type="regex" value="\\xFF[\\xE3\\xF2-\\xF7\\xFA-\\xFD\\xFF]" offset="512:8192" /> + </match> <!-- in the wild, 0D0A or quite a few \x00 may precede the magic --> <match value="(?:\\x0D\\x0A|\\x00{1,1024})(?:\\xff[\\xe3\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd\\xfe\\xff]|ID3)" type="regex" offset="0"/> </magic> + <!-- TIKA-4582: Low-priority fallback for ID3-only files (truncated or very large ID3 tags) --> + <magic priority="10"> + <match value="ID3" type="string" offset="0"/> + </magic> <glob pattern="*.mpga"/> <glob pattern="*.mp2"/> <glob pattern="*.mp2a"/> @@ -6159,13 +6166,14 @@ <alias type="audio/aac"/> <glob pattern="*.aac"/> <magic priority="30"> - <!-- Without ID3 tags --> - <match type="regex" value="\\xFF(\\xF0|\\xF1|\\xF8|\\xF9)(\\x40|\\x41|\\x44|\\x45|\\x48|\\x49|\\x4C|\\x4D|\\x50|\\x51|\\x54|\\x55|\\x58|\\x59|\\x5C|\\x5D|\\x60|\\x61|\\x64|\\x65|\\x68|\\x69|\\x6C|\\x6D|\\x70|\\x71|\\x80|\\x81|\\x84|\\x85|\\x88|\\x89|\\x8C|\\x8D|\\x90|\\x91|\\x94|\\x95|\\x98|\\x99|\\x9C|\\x9D|\\xA0|\\xA1|\\xA4|\\xA5|\\xA8|\\xA9|\\xAC|\\xAD|\\xB0|\\xB1)(\\x00|\\x01|\\x20|\\x40|\\x41|\\x60|\\x80|\\x81|\\x60|\\xA0|\\xC0|\\xC1|\\xE0)" offset="0" /> + <!-- Without ID3 tags - require two consecutive ADTS frame syncs --> + <!-- ADTS frames are typically 100-2000 bytes, so look for second sync at that distance --> + <match type="regex" value="(?s)\\xFF[\\xF0\\xF1\\xF8\\xF9].{2}.{100,2000}\\xFF[\\xF0\\xF1\\xF8\\xF9]" offset="0" /> </magic> <magic priority="40"> - <!-- With ID3 tags at the start --> + <!-- With ID3 tags at the start - require two consecutive ADTS frame syncs --> <match value="ID3" type="string" offset="0"> - <match type="regex" value="\\xFF(\\xF0|\\xF1|\\xF8|\\xF9)(\\x40|\\x41|\\x44|\\x45|\\x48|\\x49|\\x4C|\\x4D|\\x50|\\x51|\\x54|\\x55|\\x58|\\x59|\\x5C|\\x5D|\\x60|\\x61|\\x64|\\x65|\\x68|\\x69|\\x6C|\\x6D|\\x70|\\x71|\\x80|\\x81|\\x84|\\x85|\\x88|\\x89|\\x8C|\\x8D|\\x90|\\x91|\\x94|\\x95|\\x98|\\x99|\\x9C|\\x9D|\\xA0|\\xA1|\\xA4|\\xA5|\\xA8|\\xA9|\\xAC|\\xAD|\\xB0|\\xB1)(\\x00|\\x01|\\x20|\\x40|\\x41|\\x60|\\x80|\\x81|\\x60|\\xA0|\\xC0|\\xC1|\\xE0)" offset="512:2048" /> + <match type="regex" value="(?s)\\xFF[\\xF0\\xF1\\xF8\\xF9].{2}.{100,2000}\\xFF[\\xF0\\xF1\\xF8\\xF9]" offset="512:8192" /> </match> </magic> </mime-type> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java index 25bb8347fc..ae731d9f30 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -1385,6 +1385,15 @@ public class TestMimeTypes { assertTypeByName("audio/x-aac", "x.aac"); } + /** + * TIKA-4582: MP3 files with ID3 tags should not be misdetected as AAC. + * This test file contains random audio data that happens to match the AAC ADTS sync pattern. + */ + @Test + public void testMP3Detection() throws Exception { + assertTypeByData("audio/mpeg", "testMP3_id3_false_aac.mp3"); + } + private void assertText(byte[] prefix) throws IOException { assertMagic("text/plain", prefix); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testMP3_id3_false_aac.mp3 b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testMP3_id3_false_aac.mp3 new file mode 100644 index 0000000000..c8366dc2c7 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testMP3_id3_false_aac.mp3 differ
