This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 2b3dd510f TIKA-1180: Add MatroskaDetector for improved MKV/WEBM
detection (#2251)
2b3dd510f is described below
commit 2b3dd510f3702ea005f2916122c24d66d203743a
Author: Siraj A. <[email protected]>
AuthorDate: Wed Sep 10 13:12:03 2025 -0400
TIKA-1180: Add MatroskaDetector for improved MKV/WEBM detection (#2251)
---
.../org/apache/tika/detect/MatroskaDetector.java | 88 ++++++++++++++++++++++
.../services/org.apache.tika.detect.Detector | 16 ++++
.../apache/tika/detect/MatroskaDetectorTest.java | 26 +++++++
3 files changed, 130 insertions(+)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java
new file mode 100644
index 000000000..27aa17ba1
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an "AS IS"
+ * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Objects;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Detector for Matroska (MKV and WEBM) files based on the EBML header.
+ */
+public class MatroskaDetector implements Detector {
+
+ /** For serialization compatibility. */
+ private static final long serialVersionUID = 1L;
+
+ private static final MediaType MATROSKA =
+ MediaType.application("x-matroska");
+
+ private static final MediaType WEBM =
+ MediaType.video("webm");
+
+ private static final byte[] EBML_HEADER =
+ new byte[]{0x1A, 0x45, (byte) 0xDF, (byte) 0xA3};
+
+ /**
+ * Detects the media type of the input stream by inspecting EBML headers.
+ *
+ * @param input the input stream
+ * @param metadata the metadata to populate
+ * @return detected MediaType (WEBM, Matroska, or OCTET_STREAM)
+ * @throws IOException if an I/O error occurs
+ */
+ @Override
+ public MediaType detect(InputStream input, Metadata metadata) throws
IOException {
+ Objects.requireNonNull(input, "input stream must not be null");
+ input.mark(64);
+
+ byte[] header = new byte[64];
+ int bytesRead = input.read(header);
+ input.reset();
+
+ if (bytesRead < EBML_HEADER.length) {
+ return MediaType.OCTET_STREAM;
+ }
+
+ for (int i = 0; i < EBML_HEADER.length; i++) {
+ if (header[i] != EBML_HEADER[i]) {
+ return MediaType.OCTET_STREAM;
+ }
+ }
+
+ for (int i = 4; i < bytesRead - 4; i++) {
+ if (header[i] == 'w'
+ && header[i + 1] == 'e'
+ && header[i + 2] == 'b'
+ && header[i + 3] == 'm') {
+ return WEBM;
+ }
+ if (header[i] == 'm'
+ && header[i + 1] == 'a'
+ && header[i + 2] == 't'
+ && header[i + 3] == 'r') {
+ return MATROSKA;
+ }
+ }
+
+ return MediaType.OCTET_STREAM;
+ }
+}
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
new file mode 100644
index 000000000..1428c6c4d
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.detect.MatroskaDetector
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java
new file mode 100644
index 000000000..32ed8daa1
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java
@@ -0,0 +1,26 @@
+package org.apache.tika.detect;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.junit.jupiter.api.Test;
+
+public class MatroskaDetectorTest {
+
+ private final MatroskaDetector detector = new MatroskaDetector();
+
+ private InputStream getResourceAsStream(String resourcePath) {
+ return this.getClass().getResourceAsStream(resourcePath);
+ }
+
+ @Test
+ public void testDetectMKV() throws IOException {
+ assertEquals(MediaType.video("x-matroska"),
+
detector.detect(getResourceAsStream("/test-documents/sample.nonexist"),
+ new Metadata()));
+ }
+}
\ No newline at end of file