This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new cf7d87a046 TIKA-4747 -- add axml detection (#2865)
cf7d87a046 is described below

commit cf7d87a046f099c8a9b681bbb0c18c3e05c816ae
Author: Tim Allison <[email protected]>
AuthorDate: Thu Jun 4 21:38:58 2026 -0400

    TIKA-4747 -- add axml detection (#2865)
---
 .../org/apache/tika/mime/tika-mimetypes.xml        |  15 +++
 .../org/apache/tika/mime/MimeDetectionTest.java    |   5 +
 .../org/apache/tika/mime/test-android-binary.xml   | Bin 0 -> 64 bytes
 .../apache/tika/parser/AndroidBinaryXMLTest.java   | 122 +++++++++++++++++++++
 4 files changed, 142 insertions(+)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index f447bf3d97..34b9d27943 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -352,6 +352,21 @@
     <sub-class-of type="application/java-archive"/>
     <glob pattern="*.apk"/>
   </mime-type>
+  <mime-type type="application/vnd.android.axml">
+    <acronym>AXML</acronym>
+    <_comment>Android Binary XML</_comment>
+    <tika:link>https://developer.android.com/guide/topics/manifest/manifest-intro</tika:link>
+    <!-- Compiled AndroidManifest.xml / res/*.xml inside an APK. The .xml extension would
+         otherwise route it to the XML parser, which fails on the binary header. Signature:
+         RES_XML_TYPE(0x0003)+headerSize(0x0008)=0x00080003 LE, plus RES_STRING_POOL_TYPE
+         (0x0001) at offset 8 (the variable per-file size at offset 4 is skipped). Not a
+         sub-class-of application/xml: must not reach an XML parser. -->
+    <magic priority="50">
+      <match value="0x03000800" type="string" offset="0">
+        <match value="0x0001" type="little16" offset="8"/>
+      </match>
+    </magic>
+  </mime-type>
   <mime-type type="application/x-tika-java-enterprise-archive">
     <sub-class-of type="application/java-archive"/>
     <glob pattern="*.ear"/>
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
index 76268b5fea..77d4604471 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
@@ -85,6 +85,11 @@ public class MimeDetectionTest {
 
         // truncated xml should still be detected as xml, See TIKA-3596
         testFile("application/xml", "truncated-utf16-xml.xyz");
+
+        // Android Binary XML (compiled AndroidManifest.xml / res/*.xml inside an APK).
+        // Carries a .xml extension, so magic must win over the *.xml glob and it must
+        // NOT be routed to application/xml / the XML parser. See TIKA-4747.
+        testFile("application/vnd.android.axml", "test-android-binary.xml");
     }
 
     @Test
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-android-binary.xml b/tika-core/src/test/resources/org/apache/tika/mime/test-android-binary.xml
new file mode 100644
index 0000000000..d022dee128
Binary files /dev/null and b/tika-core/src/test/resources/org/apache/tika/mime/test-android-binary.xml differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/AndroidBinaryXMLTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/AndroidBinaryXMLTest.java
new file mode 100644
index 0000000000..41286f218d
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/AndroidBinaryXMLTest.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.io.ByteArrayOutputStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipOutputStream;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+/**
+ * Android Binary XML (AXML) is the compiled binary form of AndroidManifest.xml and the
+ * res/*.xml resources packed inside an APK. Those entries keep a .xml extension and live
+ * inside the (zip) APK, so before TIKA-4747 the *.xml glob caused them to be detected as
+ * application/xml and handed to the XML parser, which failed on the binary header with
+ * "Invalid byte 1 of 1-byte UTF-8 sequence". This was a large source of exceptions in
+ * regression runs over APK-heavy corpora.
+ *
+ * <p>Real corpus APKs can't be committed, so this builds an equivalent zip in memory:
+ * two compiled (AXML) entries plus one genuine text-XML entry under assets/ as a control,
+ * and asserts the AXML entries are detected as application/vnd.android.axml and produce no
+ * exception, while the text-XML entry is still application/xml.
+ */
+public class AndroidBinaryXMLTest extends TikaTest {
+
+    private static final String AXML = "application/vnd.android.axml";
+
+    /**
+     * Minimal compiled-AXML header: a RES_XML_TYPE ResChunk_header plus the ResStringPool
+     * chunk real AXML always carries. The magic matches 0x00080003 (LE) at offset 0 and the
+     * string-pool type 0x0001 at offset 8, so both must be present.
+     */
+    private static byte[] axmlBytes() {
+        ByteBuffer bb = ByteBuffer.allocate(64).order(ByteOrder.LITTLE_ENDIAN);
+        bb.putShort((short) 0x0003);   // RES_XML_TYPE
+        bb.putShort((short) 0x0008);   // headerSize
+        bb.putInt(64);                 // total chunk size == file length (skipped by magic)
+        bb.putShort((short) 0x0001);   // RES_STRING_POOL_TYPE (checked at offset 8)
+        bb.putShort((short) 0x001C);   // string-pool headerSize
+        bb.putInt(0x00000038);         // string-pool chunk size == 64 - 8 (spans offset 8..EOF)
+        // remaining bytes (string/style counts, flags, offsets) left zero
+        return bb.array();
+    }
+
+    private static byte[] zipWith(String[] names, byte[][] contents) throws Exception {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        try (ZipOutputStream zos = new ZipOutputStream(bos)) {
+            for (int i = 0; i < names.length; i++) {
+                zos.putNextEntry(new ZipEntry(names[i]));
+                zos.write(contents[i]);
+                zos.closeEntry();
+            }
+        }
+        return bos.toByteArray();
+    }
+
+    @Test
+    public void testAxmlInsideZipNotRoutedToXmlParser() throws Exception {
+        byte[] textXml =
+                "<?xml version=\"1.0\"?><root><city>example</city></root>".getBytes(StandardCharsets.UTF_8);
+        byte[] zip = zipWith(
+                new String[] {"AndroidManifest.xml", "res/anim/anim0to1.xml", "assets/province_data.xml"},
+                new byte[][] {axmlBytes(), axmlBytes(), textXml});
+
+        List<Metadata> metadataList;
+        try (TikaInputStream tis = TikaInputStream.get(zip)) {
+            metadataList = getRecursiveMetadata(tis, true);
+        }
+
+        Metadata manifest = byPathSuffix(metadataList, "AndroidManifest.xml");
+        Metadata resAnim = byPathSuffix(metadataList, "anim0to1.xml");
+        Metadata assetXml = byPathSuffix(metadataList, "province_data.xml");
+
+        // The two compiled AXML entries: detected as AXML, NOT routed to the XML parser.
+        assertEquals(AXML, manifest.get(Metadata.CONTENT_TYPE));
+        assertEquals(AXML, resAnim.get(Metadata.CONTENT_TYPE));
+        assertNull(manifest.get(TikaCoreProperties.EMBEDDED_EXCEPTION),
+                "AXML manifest must not throw a parse exception");
+        assertNull(resAnim.get(TikaCoreProperties.EMBEDDED_EXCEPTION),
+                "AXML resource must not throw a parse exception");
+
+        // Control: a genuine text XML under assets/ is still detected and parsed as XML.
+        assertEquals("application/xml", assetXml.get(Metadata.CONTENT_TYPE));
+        assertNull(assetXml.get(TikaCoreProperties.EMBEDDED_EXCEPTION));
+    }
+
+    private static Metadata byPathSuffix(List<Metadata> metadataList, String suffix) {
+        for (Metadata m : metadataList) {
+            String path = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+            if (path != null && path.endsWith(suffix)) {
+                return m;
+            }
+        }
+        throw new AssertionError("No embedded entry found ending with: " + suffix);
+    }
+}

Reply via email to