This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4344
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6f2bdeb6a435762a6fa8e1d4b61960f8129e1e2b
Author: tallison <[email protected]>
AuthorDate: Tue Nov 5 11:26:00 2024 -0500

    TIKA-4344 -- add a magika wrapper
---
 pom.xml                                            |   2 +-
 tika-detectors/pom.xml                             |   1 +
 tika-detectors/tika-detector-magika/pom.xml        | 112 ++++++++
 .../apache/tika/detect/magika/MagikaDetector.java  | 298 +++++++++++++++++++++
 .../services/org.apache.tika.detect.Detector       |  16 ++
 .../tika/detect/magika/TestMagikaIntegration.java  |  59 ++++
 .../tika/detect/magika/TestMagikaJsonParsing.java  |  92 +++++++
 .../src/test/resources/configs/tika-config.xml     |  28 ++
 .../src/test/resources/json/test-basic.json        |  21 ++
 .../src/test/resources/test-documents/testPDF.pdf  | Bin 0 -> 34824 bytes
 10 files changed, 628 insertions(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index aa7fe028f..035b98fd5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -39,6 +39,7 @@
     <module>tika-bom</module>
     <module>tika-core</module>
     <module>tika-serialization</module>
+    <module>tika-detectors</module>
     <module>tika-parsers</module>
     <module>tika-bundles</module>
     <module>tika-xmp</module>
@@ -54,7 +55,6 @@
     <module>tika-translate</module>
     <module>tika-example</module>
     <module>tika-java7</module>
-    <module>tika-detectors</module>
     <module>tika-handlers</module>
   </modules>
 
diff --git a/tika-detectors/pom.xml b/tika-detectors/pom.xml
index 145719d7a..1e0981b53 100644
--- a/tika-detectors/pom.xml
+++ b/tika-detectors/pom.xml
@@ -35,5 +35,6 @@
 
   <modules>
     <module>tika-detector-siegfried</module>
+    <module>tika-detector-magika</module>
   </modules>
 </project>
\ No newline at end of file
diff --git a/tika-detectors/tika-detector-magika/pom.xml 
b/tika-detectors/tika-detector-magika/pom.xml
new file mode 100644
index 000000000..c4ba65f23
--- /dev/null
+++ b/tika-detectors/tika-detector-magika/pom.xml
@@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <parent>
+    <artifactId>tika-detectors</artifactId>
+    <groupId>org.apache.tika</groupId>
+    <version>4.0.0-SNAPSHOT</version>
+  </parent>
+  <modelVersion>4.0.0</modelVersion>
+
+  <artifactId>tika-detector-magika</artifactId>
+  <name>Apache Tika magika wrapper</name>
+
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.logging.log4j</groupId>
+      <artifactId>log4j-core</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.logging.log4j</groupId>
+      <artifactId>log4j-slf4j2-impl</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  <build>
+    <plugins>
+      <!-- Builds the shaded jar during the package phase. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <createDependencyReducedPom>false</createDependencyReducedPom>
+              <filters>
+                <filter>
+                  <artifact>*:*</artifact>
+                  <!-- drop module descriptors, signature files and text files from every shaded artifact -->
+                  <excludes>
+                    <exclude>module-info.class</exclude>
+                    <exclude>META-INF/*.SF</exclude>
+                    <exclude>META-INF/*.DSA</exclude>
+                    <exclude>META-INF/*.RSA</exclude>
+                    <exclude>META-INF/*.txt</exclude>
+                  </excludes>
+                </filter>
+              </filters>
+              <transformers>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                  <manifestEntries>
+                    <Multi-Release>true</Multi-Release>
+                  </manifestEntries>
+                </transformer>
+              </transformers>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <!-- Adds the Automatic-Module-Name entry to the jar manifest. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifestEntries>
+              <Automatic-Module-Name>org.apache.tika.detector.magika</Automatic-Module-Name>
+            </manifestEntries>
+          </archive>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
\ No newline at end of file
diff --git 
a/tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java
 
b/tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java
new file mode 100644
index 000000000..f3ed0b310
--- /dev/null
+++ 
b/tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.magika;
+
+import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.BoundedInputStream;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.ExternalProcess;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.utils.FileProcessResult;
+import org.apache.tika.utils.ProcessUtils;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Simple wrapper around Google's magika: https://github.com/google/magika
+ * The tool must be installed on the host where Tika is running.
+ * The default behavior is to run detection, report the results in the metadata
+ * and then return application/octet-stream so that other detectors will be used.
+ */
+public class MagikaDetector implements Detector {
+
+    enum STATUS {
+        SUCCESS, TIMEOUT, CRASH, JSON_PARSE_EXCEPTION
+    }
+
+    /** Prefix shared by every metadata key declared below. */
+    public static final String MAGIKA_PREFIX = "magika:";
+
+    //The metadata keys are constants: they are declared final so that
+    //no other code can reassign them.
+    public static final Property MAGIKA_STATUS =
+            Property.externalText(MAGIKA_PREFIX + "status");
+    public static final Property MAGIKA_DESCRIPTION =
+            Property.externalText(MAGIKA_PREFIX + "description");
+    public static final Property MAGIKA_SCORE =
+            Property.externalReal(MAGIKA_PREFIX + "score");
+    public static final Property MAGIKA_GROUP =
+            Property.externalText(MAGIKA_PREFIX + "group");
+    public static final Property MAGIKA_LABEL =
+            Property.externalText(MAGIKA_PREFIX + "label");
+    public static final Property MAGIKA_MIME =
+            Property.externalText(MAGIKA_PREFIX + "mime_type");
+
+    public static final Property MAGIKA_ERRORS =
+            Property.externalTextBag(MAGIKA_PREFIX + "errors");
+
+    public static final Property MAGIKA_VERSION =
+            Property.externalText(MAGIKA_PREFIX + "version");
+
+    //TODO -- grab errors and warnings
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(MagikaDetector.class);
+    private static final long DEFAULT_TIMEOUT_MS = 60000;
+    private static final String DEFAULT_MAGIKA_PATH = "magika";
+
+    //we set this during the initial check.
+    //we assume that a new version is not installed during the lifecycle of the MagikaDetector
+    private static String MAGIKA_VERSION_STRING = "";
+
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+    //flipped to true after the first "command isn't working" warning has been logged
+    private static boolean HAS_WARNED = false;
+    //null until the availability of the magika command has been checked
+    private Boolean hasMagika = null;
+    private String magikaPath = DEFAULT_MAGIKA_PATH;
+    //mark limit and upper bound on bytes spooled to a temp file for plain InputStreams
+    private int maxBytes = 1_000_000;
+    private long timeoutMs = DEFAULT_TIMEOUT_MS;
+
+    //whether detect() returns the mime reported by magika
+    private boolean useMime = false;
+
+    /**
+     * Runs <code>magikaCommandPath --version</code> to find out whether magika
+     * can be executed. When the output contains a "Magika version:" line, the
+     * reported version is remembered in a static field.
+     *
+     * @param magikaCommandPath command name or path used to launch magika
+     * @return <code>true</code> if the process ran and exited with value 0;
+     *         <code>false</code> if it could not be run or exited with any other value
+     */
+    public static boolean checkHasMagika(String magikaCommandPath) {
+        String[] commandline = new String[]{magikaCommandPath, "--version"};
+        FileProcessResult result;
+        try {
+            result = ProcessUtils.execute(new ProcessBuilder(commandline),
+                    1000, 1000, 1000);
+        } catch (IOException e) {
+            //execute() threw, so no result (and no stderr) exists here;
+            //log the exception instead of dereferencing a null result
+            LOGGER.debug("problem with magika", e);
+            return false;
+        }
+
+        if (result.getExitValue() != 0) {
+            LOGGER.debug("problem with magika: " + result.getStderr());
+            return false;
+        }
+        Matcher m = Pattern
+                .compile("Magika version:\\s+(.{4,50})").matcher("");
+        for (String line : result.getStdout().split("[\r\n]+")) {
+            if (m.reset(line).find()) {
+                MAGIKA_VERSION_STRING = m.group(1);
+                break;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Runs magika on the document and records what it reports in the metadata.
+     * A TikaInputStream is handed to magika via its file path; any other stream
+     * is copied (up to <code>maxBytes</code>) to a temporary file first and is
+     * reset afterwards.
+     *
+     * @param input    document input stream, or <code>null</code>
+     * @param metadata input metadata for the document
+     * @return mime as identified by magika if {@link #setUseMime(boolean)} was set to
+     *         <code>true</code>, or application/octet-stream otherwise
+     * @throws IOException if running magika or spooling/resetting the stream fails
+     */
+    @Override
+    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+        if (hasMagika == null) {
+            hasMagika = checkHasMagika(this.magikaPath);
+        }
+        if (!hasMagika) {
+            if (!HAS_WARNED) {
+                LOGGER.warn("'magika' command isn't working: '" + magikaPath + "'");
+                HAS_WARNED = true;
+            }
+            return MediaType.OCTET_STREAM;
+        }
+        if (input == null) {
+            //a null stream is explicitly allowed (see @param input);
+            //there is nothing to hand to magika, so don't call mark() on it
+            return MediaType.OCTET_STREAM;
+        }
+        TikaInputStream tis = TikaInputStream.cast(input);
+        if (tis != null) {
+            //spool the full file to disk, if called with a TikaInputStream
+            //and there is no underlying file
+            return detectOnPath(tis.getPath(), metadata);
+        }
+
+        input.mark(maxBytes);
+        try (TemporaryResources tmp = new TemporaryResources()) {
+            Path tmpFile = tmp.createTempFile();
+            Files.copy(new BoundedInputStream(maxBytes, input), tmpFile, REPLACE_EXISTING);
+            return detectOnPath(tmpFile, metadata);
+        } finally {
+            //hand the stream back at the position it had on entry
+            input.reset();
+        }
+    }
+
+    /**
+     * As default behavior, Tika runs magika to add its detection
+     * to the metadata, but NOT to use detection in determining parsers
+     * etc.  If this is set to <code>true</code>, this detector
+     * will return the first mime detected by magika and that
+     * mime will be used by the AutoDetectParser to select the appropriate
+     * parser.
+     *
+     * @param useMime <code>true</code> to return the mime reported by magika;
+     *                <code>false</code> (the default) to only record it in the metadata
+     */
+    @Field
+    public void setUseMime(boolean useMime) {
+        this.useMime = useMime;
+    }
+
+    /**
+     * @return whether this detector returns the mime reported by magika
+     *         (see {@link #setUseMime(boolean)})
+     */
+    public boolean isUseMime() {
+        return useMime;
+    }
+
+    /**
+     * Launches magika with <code>--json</code> on the given file and passes the
+     * process result, together with the <code>useMime</code> setting, to
+     * {@link #processResult(FileProcessResult, Metadata, boolean)}.
+     *
+     * @param path     file to run magika on
+     * @param metadata metadata that receives the magika output
+     * @return whatever <code>processResult</code> returns
+     * @throws IOException if the process cannot be executed
+     */
+    private MediaType detectOnPath(Path path, Metadata metadata) throws IOException {
+        String command = ProcessUtils.escapeCommandLine(magikaPath);
+        String target = ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString());
+        ProcessBuilder builder = new ProcessBuilder(command, target, "--json");
+        FileProcessResult result = ProcessUtils.execute(builder, timeoutMs, 10000000, 1000);
+        return processResult(result, metadata, useMime);
+    }
+
+    /**
+     * Transfers the outcome of a magika run into the metadata.
+     * <p>
+     * The exit value and the timeout flag are always recorded. If the process
+     * timed out, exited with a non-zero value, or printed something other than
+     * a non-empty JSON array, a status is recorded and application/octet-stream
+     * is returned. Otherwise the fields of the "output" object of the first
+     * array element are copied to the <code>magika:</code> metadata keys.
+     *
+     * @param result     result of running magika with <code>--json</code>
+     * @param metadata   metadata to write to
+     * @param returnMime whether to return the mime reported by magika
+     * @return the mime reported by magika if <code>returnMime</code> is set and the
+     *         value is non-blank and parseable; application/octet-stream otherwise
+     */
+    protected static MediaType processResult(FileProcessResult result, Metadata metadata,
+                                             boolean returnMime) {
+        metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue());
+        metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout());
+
+        if (result.isTimeout()) {
+            metadata.set(MAGIKA_STATUS, STATUS.TIMEOUT.name());
+            return MediaType.OCTET_STREAM;
+        }
+        if (result.getExitValue() != 0) {
+            metadata.set(MAGIKA_STATUS, STATUS.CRASH.name());
+            return MediaType.OCTET_STREAM;
+        }
+        JsonNode rootArray;
+        try {
+            rootArray = OBJECT_MAPPER.readTree(result.getStdout());
+        } catch (JsonProcessingException e) {
+            metadata.set(MAGIKA_STATUS, STATUS.JSON_PARSE_EXCEPTION.name());
+            return MediaType.OCTET_STREAM;
+        }
+        if (rootArray == null || ! rootArray.isArray() || rootArray.isEmpty()) {
+            //stdout was readable but is not the non-empty array that is expected;
+            //record that instead of leaving the status unset
+            metadata.set(MAGIKA_STATUS, STATUS.JSON_PARSE_EXCEPTION.name());
+            return MediaType.OCTET_STREAM;
+        }
+        //for now just take the first value
+        JsonNode root = rootArray.get(0);
+        metadata.set(MAGIKA_STATUS, "ok");
+        //TODO -- should we get values in "dl" instead or in addition?
+
+        if (! root.has("output")) {
+            //do something else
+            return MediaType.OCTET_STREAM;
+        }
+        JsonNode mOutput = root.get("output");
+        if (mOutput.has("score")) {
+            double score = mOutput.get("score").asDouble(-1.0);
+            metadata.set(MAGIKA_SCORE, score);
+        }
+        addString(mOutput, "description", MAGIKA_DESCRIPTION, metadata);
+        addString(mOutput, "group", MAGIKA_GROUP, metadata);
+        addString(mOutput, "ct_label", MAGIKA_LABEL, metadata);
+        addString(mOutput, "mime_type", MAGIKA_MIME, metadata);
+        metadata.set(MAGIKA_VERSION, MAGIKA_VERSION_STRING);
+
+        String mime = metadata.get(MAGIKA_MIME);
+        if (returnMime && ! StringUtils.isBlank(mime)) {
+            //MediaType.parse yields null for a string it cannot parse;
+            //never hand that null back to the caller
+            MediaType parsed = MediaType.parse(mime);
+            if (parsed != null) {
+                return parsed;
+            }
+        }
+
+        return MediaType.OCTET_STREAM;
+    }
+
+    /**
+     * Copies a boolean JSON field into the metadata. Nothing is written when
+     * the field is absent or its value is not a JSON boolean.
+     *
+     * @param node     JSON object to read from
+     * @param jsonKey  name of the field to read
+     * @param property metadata key to write
+     * @param metadata metadata to write to
+     */
+    private static void setBoolean(JsonNode node, String jsonKey, Property property,
+                                   Metadata metadata) {
+        JsonNode value = node.has(jsonKey) ? node.get(jsonKey) : null;
+        //a present but non-boolean value is silently skipped (log?)
+        if (value != null && value.isBoolean()) {
+            metadata.set(property, value.booleanValue());
+        }
+    }
+
+    /**
+     * Copies a JSON field into the metadata as text. For an array every
+     * non-blank element is added to the property; for any other value the
+     * property is set to its text unless that text is blank. An absent field
+     * leaves the metadata untouched.
+     *
+     * @param node     JSON object to read from
+     * @param jsonKey  name of the field to read
+     * @param property metadata key to write
+     * @param metadata metadata to write to
+     */
+    private static void addString(JsonNode node, String jsonKey, Property property,
+                                  Metadata metadata) {
+        if (! node.has(jsonKey)) {
+            return;
+        }
+        JsonNode value = node.get(jsonKey);
+        if (! value.isArray()) {
+            String text = value.asText(StringUtils.EMPTY);
+            if (! StringUtils.isBlank(text)) {
+                metadata.set(property, text);
+            }
+            return;
+        }
+        for (JsonNode element : value) {
+            String text = element.asText(StringUtils.EMPTY);
+            if (! StringUtils.isBlank(text)) {
+                metadata.add(property, text);
+            }
+        }
+    }
+
+    /**
+     * Sets the command used to launch magika and immediately checks whether
+     * that command works.
+     *
+     * @param fileCommandPath command name or full path of the magika executable
+     */
+    @Field
+    public void setMagikaPath(String fileCommandPath) {
+        //this opens up a potential command vulnerability.
+        //Don't ever let an untrusted user set this.
+        this.magikaPath = fileCommandPath;
+        //keep the outcome of the check: otherwise the probe is wasted and an
+        //availability result cached for a previous path would still be used by detect()
+        this.hasMagika = checkHasMagika(this.magikaPath);
+    }
+
+    /**
+     * If this is not called on a TikaInputStream, this detector
+     * will spool up to this many bytes to a file to be detected
+     * by magika.
+     *
+     * @param maxBytes maximum number of bytes to copy to the temporary file;
+     *                 also used as the mark limit on the input stream
+     */
+    @Field
+    public void setMaxBytes(int maxBytes) {
+        this.maxBytes = maxBytes;
+    }
+
+    /**
+     * Sets the timeout value handed to ProcessUtils.execute when magika is run
+     * on a file. The <code>--version</code> check uses its own fixed value.
+     *
+     * @param timeoutMs timeout in milliseconds
+     */
+    @Field
+    public void setTimeoutMs(long timeoutMs) {
+        this.timeoutMs = timeoutMs;
+    }
+}
diff --git 
a/tika-detectors/tika-detector-magika/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
 
b/tika-detectors/tika-detector-magika/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
new file mode 100644
index 000000000..380301a67
--- /dev/null
+++ 
b/tika-detectors/tika-detector-magika/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.detect.magika.MagikaDetector
\ No newline at end of file
diff --git 
a/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaIntegration.java
 
b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaIntegration.java
new file mode 100644
index 000000000..f00e020fa
--- /dev/null
+++ 
b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaIntegration.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.magika;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.net.URISyntaxException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+
+@Disabled("need to have magika on the path")
+public class TestMagikaIntegration extends TikaTest {
+
+    /**
+     * Parses testPDF.pdf with the magika detector configured in
+     * tika-config.xml and checks the magika metadata of the container document.
+     */
+    @Test
+    public void testIntegration() throws Exception {
+        TikaConfig tikaConfig = new TikaConfig(getConfig("tika-config.xml"));
+        Parser p = new AutoDetectParser(tikaConfig);
+        List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf", p);
+        //print the list we already have instead of parsing the file a second time
+        debug(metadataList);
+        Metadata m = metadataList.get(0);
+        assertEquals("PDF document", m.get(MagikaDetector.MAGIKA_DESCRIPTION));
+        assertEquals(1.0, Double.parseDouble(m.get(MagikaDetector.MAGIKA_SCORE)), 0.000001);
+        assertEquals("document", m.get(MagikaDetector.MAGIKA_GROUP));
+        assertEquals("0.5.1", m.get(MagikaDetector.MAGIKA_VERSION));
+        assertEquals("application/pdf", m.get(MagikaDetector.MAGIKA_MIME));
+        assertEquals("application/pdf", m.get(Metadata.CONTENT_TYPE));
+        assertEquals("ok", m.get(MagikaDetector.MAGIKA_STATUS));
+    }
+
+    /**
+     * Resolves a file under <code>/configs/</code> on the test classpath.
+     *
+     * @param configName file name of the config
+     * @return path to the config file
+     * @throws URISyntaxException if the resource URL cannot be converted to a URI
+     */
+    private Path getConfig(String configName) throws URISyntaxException {
+        String resource = "/configs/" + configName;
+        //a missing resource still fails with a NullPointerException, but now with a message
+        return Paths.get(java.util.Objects.requireNonNull(
+                getClass().getResource(resource),
+                "couldn't find test resource: " + resource).toURI());
+    }
+
+}
diff --git 
a/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaJsonParsing.java
 
b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaJsonParsing.java
new file mode 100644
index 000000000..ef71d6656
--- /dev/null
+++ 
b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaJsonParsing.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.magika;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.utils.FileProcessResult;
+
+public class TestMagikaJsonParsing extends TikaTest {
+
+    //TODO -- add testcontainers unit test with dockerized magika
+
+    @Test
+    public void testBasic() throws Exception {
+        FileProcessResult fileProcessResult = load("test-basic.json");
+        Metadata metadata = new Metadata();
+        MagikaDetector.processResult(fileProcessResult, metadata, false);
+        assertEquals("ok", metadata.get(MagikaDetector.MAGIKA_STATUS));
+        assertEquals("Python source", 
metadata.get(MagikaDetector.MAGIKA_DESCRIPTION));
+        assertEquals(0.999987125396, 
Double.parseDouble(metadata.get(MagikaDetector.MAGIKA_SCORE)), 0.0000001);
+        assertEquals("code", metadata.get(MagikaDetector.MAGIKA_GROUP));
+        assertEquals("python", metadata.get(MagikaDetector.MAGIKA_LABEL));
+        assertEquals("text/x-python", 
metadata.get(MagikaDetector.MAGIKA_MIME));
+    }
+/*
+    @Test
+    public void testErrors() throws Exception {
+        FileProcessResult fileProcessResult = load("test-errors.json");
+        Metadata metadata = new Metadata();
+        SiegfriedDetector.processResult(fileProcessResult, metadata, false);
+        //debug(metadata);
+        assertEquals("1.9.5", 
metadata.get(SiegfriedDetector.SIEGFRIED_VERSION));
+        assertEquals("default.sig", 
metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE));
+        assertEquals("x-fmt/111", metadata.get("sf:pronom:id"));
+        assertEquals("extension match txt", metadata.get("sf:pronom:basis"));
+        assertEquals("Plain Text File", metadata.get("sf:pronom:format"));
+        assertEquals("text/plain", metadata.get("sf:pronom:mime"));
+        assertNull(metadata.get("sf:pronom:version"));
+        assertEquals("empty source", 
metadata.get(SiegfriedDetector.SIEGFRIED_ERRORS));
+    }
+
+    @Test
+    public void testWarnings() throws Exception {
+        FileProcessResult fileProcessResult = load("test-warnings.json");
+        Metadata metadata = new Metadata();
+        SiegfriedDetector.processResult(fileProcessResult, metadata, false);
+        assertEquals("1.9.5", 
metadata.get(SiegfriedDetector.SIEGFRIED_VERSION));
+        assertEquals("default.sig", 
metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE));
+        assertEquals("UNKNOWN", metadata.get("sf:pronom:id"));
+        assertNull(metadata.get("sf:pronom:basis"));
+        assertNull(metadata.get("sf:pronom:format"));
+        assertNull(metadata.get("sf:pronom:mime"));
+        assertNull(metadata.get("sf:pronom:version"));
+        assertTrue(metadata.get("sf:pronom:warning")
+                .startsWith("no match; possibilities based on extension are 
fmt/14, fmt/15, fmt/16, " +
+                        "fmt/17, fmt/18, fmt/19"));
+    }
+
+
+*/
+
+    /**
+     * Builds a FileProcessResult with exit value 0 whose stdout is the content
+     * of a file under <code>/json/</code> on the test classpath.
+     *
+     * @param jsonFileName file name of the json fixture
+     * @return the simulated process result
+     * @throws IOException if the fixture is missing or cannot be read
+     */
+    private FileProcessResult load(String jsonFileName) throws IOException {
+        String resource = "/json/" + jsonFileName;
+        String jsonString;
+        //close the stream when done and fail clearly if the fixture is missing
+        try (java.io.InputStream is = getClass().getResourceAsStream(resource)) {
+            if (is == null) {
+                throw new IOException("couldn't find test resource: " + resource);
+            }
+            jsonString = IOUtils.toString(is, StandardCharsets.UTF_8);
+        }
+        FileProcessResult r = new FileProcessResult();
+        r.setStdout(jsonString);
+        r.setExitValue(0);
+        return r;
+    }
+}
diff --git 
a/tika-detectors/tika-detector-magika/src/test/resources/configs/tika-config.xml
 
b/tika-detectors/tika-detector-magika/src/test/resources/configs/tika-config.xml
new file mode 100644
index 000000000..6bb062161
--- /dev/null
+++ 
b/tika-detectors/tika-detector-magika/src/test/resources/configs/tika-config.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <detectors>
+    <detector class="org.apache.tika.detect.DefaultDetector"/>
+    <detector class="org.apache.tika.detect.magika.MagikaDetector">
+      <params>
+        <param name="magikaPath" type="string">magika</param> <!-- or replace 
with full path to the commandline -->
+        <param name="useMime" type="bool">true</param>
+      </params>
+    </detector>
+  </detectors>
+</properties>
diff --git 
a/tika-detectors/tika-detector-magika/src/test/resources/json/test-basic.json 
b/tika-detectors/tika-detector-magika/src/test/resources/json/test-basic.json
new file mode 100644
index 000000000..e8aa5366e
--- /dev/null
+++ 
b/tika-detectors/tika-detector-magika/src/test/resources/json/test-basic.json
@@ -0,0 +1,21 @@
+[
+  {
+    "path": "...im2txtapi.py",
+    "dl": {
+      "ct_label": "python",
+      "score": 0.9999871253967285,
+      "group": "code",
+      "mime_type": "text/x-python",
+      "magic": "Python script",
+      "description": "Python source"
+    },
+    "output": {
+      "ct_label": "python",
+      "score": 0.9999871253967285,
+      "group": "code",
+      "mime_type": "text/x-python",
+      "magic": "Python script",
+      "description": "Python source"
+    }
+  }
+]
diff --git 
a/tika-detectors/tika-detector-magika/src/test/resources/test-documents/testPDF.pdf
 
b/tika-detectors/tika-detector-magika/src/test/resources/test-documents/testPDF.pdf
new file mode 100644
index 000000000..1f1bcff6f
Binary files /dev/null and 
b/tika-detectors/tika-detector-magika/src/test/resources/test-documents/testPDF.pdf
 differ

Reply via email to