This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 408c26e1e0 TIKA-4612 -- improve mp3 and aac detection (#2520)
408c26e1e0 is described below

commit 408c26e1e03e018a623e732dff6fb047a2fb8e19
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jan 9 12:48:22 2026 -0500

    TIKA-4612 -- improve mp3 and aac detection (#2520)
---
 .../org/apache/tika/mime/tika-mimetypes.xml         |  20 ++++++++++++++------
 .../java/org/apache/tika/mime/TestMimeTypes.java    |   9 +++++++++
 .../test-documents/testMP3_id3_false_aac.mp3        | Bin 0 -> 1024 bytes
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 5c5523f47c..27ccbdb033 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5928,7 +5928,7 @@
     <alias type="audio/x-mpeg"/>
     <acronym>MP3</acronym>
     <_comment>MPEG-1 Audio Layer 3</_comment>
-    <magic priority="20">
+    <magic priority="50">
       <!-- http://mpgedit.org/mpgedit/mpeg_format/MP3Format.html -->
       <!-- Bit pattern for first two bytes: 11111111 111VVLLC    -->
       <!-- VV = MPEG Audio Version ID; 10 = V2, 11 = V1          -->
@@ -5948,11 +5948,18 @@
       <!-- TIKA-417: This is the UTF-16 LE byte order mark! -->
       <!-- match value="0xfffe" type="string" offset="0"/ --> <!-- V1, L1, CRC -->
       <match value="0xffff" type="string" offset="0"/> <!-- V1, L1      -->
-      <match value="ID3" type="string" offset="0"/>
+      <!-- TIKA-4582: Require MP3 frame sync after ID3 tag to avoid false positives with other ID3-tagged formats -->
+      <match value="ID3" type="string" offset="0">
+         <match type="regex" value="\\xFF[\\xE3\\xF2-\\xF7\\xFA-\\xFD\\xFF]" offset="512:8192" />
+      </match>
       <!-- in the wild, 0D0A or quite a few \x00 may precede the magic -->
       <match 
value="(?:\\x0D\\x0A|\\x00{1,1024})(?:\\xff[\\xe3\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd\\xfe\\xff]|ID3)"
              type="regex" offset="0"/>
     </magic>
+    <!-- TIKA-4582: Low-priority fallback for ID3-only files (truncated or very large ID3 tags) -->
+    <magic priority="10">
+      <match value="ID3" type="string" offset="0"/>
+    </magic>
     <glob pattern="*.mpga"/>
     <glob pattern="*.mp2"/>
     <glob pattern="*.mp2a"/>
@@ -6159,13 +6166,14 @@
     <alias type="audio/aac"/>
     <glob pattern="*.aac"/>
     <magic priority="30">
-      <!-- Without ID3 tags -->
-      <match type="regex" 
value="\\xFF(\\xF0|\\xF1|\\xF8|\\xF9)(\\x40|\\x41|\\x44|\\x45|\\x48|\\x49|\\x4C|\\x4D|\\x50|\\x51|\\x54|\\x55|\\x58|\\x59|\\x5C|\\x5D|\\x60|\\x61|\\x64|\\x65|\\x68|\\x69|\\x6C|\\x6D|\\x70|\\x71|\\x80|\\x81|\\x84|\\x85|\\x88|\\x89|\\x8C|\\x8D|\\x90|\\x91|\\x94|\\x95|\\x98|\\x99|\\x9C|\\x9D|\\xA0|\\xA1|\\xA4|\\xA5|\\xA8|\\xA9|\\xAC|\\xAD|\\xB0|\\xB1)(\\x00|\\x01|\\x20|\\x40|\\x41|\\x60|\\x80|\\x81|\\x60|\\xA0|\\xC0|\\xC1|\\xE0)"
 offset="0" />
+      <!-- Without ID3 tags - require two consecutive ADTS frame syncs -->
+      <!-- ADTS frames are typically 100-2000 bytes, so look for second sync at that distance -->
+      <match type="regex" value="(?s)\\xFF[\\xF0\\xF1\\xF8\\xF9].{2}.{100,2000}\\xFF[\\xF0\\xF1\\xF8\\xF9]" offset="0" />
     </magic>
     <magic priority="40">
-      <!-- With ID3 tags at the start -->
+      <!-- With ID3 tags at the start - require two consecutive ADTS frame syncs -->
       <match value="ID3" type="string" offset="0">
-         <match type="regex" 
value="\\xFF(\\xF0|\\xF1|\\xF8|\\xF9)(\\x40|\\x41|\\x44|\\x45|\\x48|\\x49|\\x4C|\\x4D|\\x50|\\x51|\\x54|\\x55|\\x58|\\x59|\\x5C|\\x5D|\\x60|\\x61|\\x64|\\x65|\\x68|\\x69|\\x6C|\\x6D|\\x70|\\x71|\\x80|\\x81|\\x84|\\x85|\\x88|\\x89|\\x8C|\\x8D|\\x90|\\x91|\\x94|\\x95|\\x98|\\x99|\\x9C|\\x9D|\\xA0|\\xA1|\\xA4|\\xA5|\\xA8|\\xA9|\\xAC|\\xAD|\\xB0|\\xB1)(\\x00|\\x01|\\x20|\\x40|\\x41|\\x60|\\x80|\\x81|\\x60|\\xA0|\\xC0|\\xC1|\\xE0)"
 offset="512:2048" />
+         <match type="regex" value="(?s)\\xFF[\\xF0\\xF1\\xF8\\xF9].{2}.{100,2000}\\xFF[\\xF0\\xF1\\xF8\\xF9]" offset="512:8192" />
       </match>
     </magic>
   </mime-type>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 25bb8347fc..ae731d9f30 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -1385,6 +1385,15 @@ public class TestMimeTypes {
         assertTypeByName("audio/x-aac", "x.aac");
     }
 
+    /**
+     * TIKA-4582: MP3 files with ID3 tags should not be misdetected as AAC.
+     * This test file contains random audio data that happens to match the AAC ADTS sync pattern.
+     */
+    @Test
+    public void testMP3Detection() throws Exception {
+        assertTypeByData("audio/mpeg", "testMP3_id3_false_aac.mp3");
+    }
+
     private void assertText(byte[] prefix) throws IOException {
         assertMagic("text/plain", prefix);
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testMP3_id3_false_aac.mp3 b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testMP3_id3_false_aac.mp3
new file mode 100644
index 0000000000..c8366dc2c7
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testMP3_id3_false_aac.mp3 differ

Reply via email to