This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 2b3dd510f TIKA-1180: Add MatroskaDetector for improved MKV/WEBM 
detection (#2251)
2b3dd510f is described below

commit 2b3dd510f3702ea005f2916122c24d66d203743a
Author: Siraj A. <[email protected]>
AuthorDate: Wed Sep 10 13:12:03 2025 -0400

    TIKA-1180: Add MatroskaDetector for improved MKV/WEBM detection (#2251)
---
 .../org/apache/tika/detect/MatroskaDetector.java   | 88 ++++++++++++++++++++++
 .../services/org.apache.tika.detect.Detector       | 16 ++++
 .../apache/tika/detect/MatroskaDetectorTest.java   | 26 +++++++
 3 files changed, 130 insertions(+)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java
new file mode 100644
index 000000000..27aa17ba1
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an "AS IS"
+ * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.  See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Objects;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Detector for Matroska (MKV and WEBM) files based on the EBML header.
+ */
+public class MatroskaDetector implements Detector {
+
+    /** For serialization compatibility. */
+    private static final long serialVersionUID = 1L;
+
+    private static final MediaType MATROSKA = MediaType.application("x-matroska");
+
+    private static final MediaType WEBM = MediaType.video("webm");
+
+    /** Magic number expected at offset 0 of the stream. */
+    private static final byte[] EBML_HEADER = {0x1A, 0x45, (byte) 0xDF, (byte) 0xA3};
+
+    /**
+     * Detects WebM or Matroska by peeking at the first 64 bytes of the stream. The stream
+     * must support mark/reset and is reset to its starting position before returning.
+     *
+     * @param input    the document stream, or {@code null} if no content is available
+     * @param metadata not consulted by this detector
+     * @return detected MediaType (WEBM, Matroska, or OCTET_STREAM)
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+        if (input == null) {
+            return MediaType.OCTET_STREAM; // the Detector contract permits a null stream
+        }
+        byte[] header = new byte[64];
+        int bytesRead = 0;
+        input.mark(header.length);
+        try {
+            // One read() may deliver fewer bytes than asked for, so loop until full or EOF.
+            int n;
+            do {
+                n = input.read(header, bytesRead, header.length - bytesRead);
+                bytesRead += Math.max(n, 0);
+            } while (n > 0 && bytesRead < header.length);
+        } finally {
+            input.reset();
+        }
+
+        for (int i = 0; i < EBML_HEADER.length; i++) {
+            if (i >= bytesRead || header[i] != EBML_HEADER[i]) {
+                return MediaType.OCTET_STREAM;
+            }
+        }
+        // "i + 4 <= bytesRead" lets the 4-byte window reach the very last bytes read.
+        for (int i = EBML_HEADER.length; i + 4 <= bytesRead; i++) {
+            if (header[i] == 'w' && header[i + 1] == 'e'
+                    && header[i + 2] == 'b' && header[i + 3] == 'm') {
+                return WEBM;
+            }
+            if (header[i] == 'm' && header[i + 1] == 'a'
+                    && header[i + 2] == 't' && header[i + 3] == 'r') {
+                return MATROSKA;
+            }
+        }
+        return MediaType.OCTET_STREAM;
+    }
+}
\ No newline at end of file
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
new file mode 100644
index 000000000..1428c6c4d
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.detect.MatroskaDetector
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java
new file mode 100644
index 000000000..32ed8daa1
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java
@@ -0,0 +1,26 @@
+package org.apache.tika.detect;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.junit.jupiter.api.Test;
+
+public class MatroskaDetectorTest {
+
+    /** EBML magic, header size, then a DocType element (ID 0x4282) holding "matroska". */
+    private static final byte[] MKV_HEADER = {0x1A, 0x45, (byte) 0xDF, (byte) 0xA3, (byte) 0x8B,
+            0x42, (byte) 0x82, (byte) 0x88, 'm', 'a', 't', 'r', 'o', 's', 'k', 'a'};
+
+    private final MatroskaDetector detector = new MatroskaDetector();
+
+    @Test
+    public void testDetectMKV() throws IOException {
+        // In-memory stream: supports mark/reset and needs no fixture file on the classpath.
+        InputStream mkv = new java.io.ByteArrayInputStream(MKV_HEADER);
+        assertEquals(MediaType.application("x-matroska"), detector.detect(mkv, new Metadata()));
+    }
+}
\ No newline at end of file

Reply via email to