This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4344 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6f2bdeb6a435762a6fa8e1d4b61960f8129e1e2b Author: tallison <[email protected]> AuthorDate: Tue Nov 5 11:26:00 2024 -0500 TIKA-4344 -- add a magika wrapper --- pom.xml | 2 +- tika-detectors/pom.xml | 1 + tika-detectors/tika-detector-magika/pom.xml | 112 ++++++++ .../apache/tika/detect/magika/MagikaDetector.java | 298 +++++++++++++++++++++ .../services/org.apache.tika.detect.Detector | 16 ++ .../tika/detect/magika/TestMagikaIntegration.java | 59 ++++ .../tika/detect/magika/TestMagikaJsonParsing.java | 92 +++++++ .../src/test/resources/configs/tika-config.xml | 28 ++ .../src/test/resources/json/test-basic.json | 21 ++ .../src/test/resources/test-documents/testPDF.pdf | Bin 0 -> 34824 bytes 10 files changed, 628 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index aa7fe028f..035b98fd5 100644 --- a/pom.xml +++ b/pom.xml @@ -39,6 +39,7 @@ <module>tika-bom</module> <module>tika-core</module> <module>tika-serialization</module> + <module>tika-detectors</module> <module>tika-parsers</module> <module>tika-bundles</module> <module>tika-xmp</module> @@ -54,7 +55,6 @@ <module>tika-translate</module> <module>tika-example</module> <module>tika-java7</module> - <module>tika-detectors</module> <module>tika-handlers</module> </modules> diff --git a/tika-detectors/pom.xml b/tika-detectors/pom.xml index 145719d7a..1e0981b53 100644 --- a/tika-detectors/pom.xml +++ b/tika-detectors/pom.xml @@ -35,5 +35,6 @@ <modules> <module>tika-detector-siegfried</module> + <module>tika-detector-magika</module> </modules> </project> \ No newline at end of file diff --git a/tika-detectors/tika-detector-magika/pom.xml b/tika-detectors/tika-detector-magika/pom.xml new file mode 100644 index 000000000..c4ba65f23 --- /dev/null +++ b/tika-detectors/tika-detector-magika/pom.xml @@ -0,0 +1,112 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <artifactId>tika-detectors</artifactId> + <groupId>org.apache.tika</groupId> + <version>4.0.0-SNAPSHOT</version> + </parent> + <modelVersion>4.0.0</modelVersion> + + <artifactId>tika-detector-magika</artifactId> + <name>Apache Tika magika wrapper</name> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-core</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j2-impl</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + </dependencies> + <build> + <plugins> + <plugin> + <artifactId>maven-shade-plugin</artifactId> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <createDependencyReducedPom> + false + </createDependencyReducedPom> + <filters> + <filter> + <artifact>*:*</artifact> + <excludes> + <exclude>module-info.class</exclude> + <exclude>module-info.class</exclude> + <exclude>META-INF/*.SF</exclude> + <exclude>META-INF/*.DSA</exclude> + <exclude>META-INF/*.RSA</exclude> + <exclude>META-INF/*.txt</exclude> + </excludes> + </filter> + </filters> + <transformers> + <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> + <manifestEntries> + <Multi-Release>true</Multi-Release> + </manifestEntries> + </transformer> + </transformers> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <configuration> + <archive> + <manifestEntries> + <Automatic-Module-Name>org.apache.tika.detector.magika</Automatic-Module-Name> + </manifestEntries> + </archive> + </configuration> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file diff --git a/tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java b/tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java new file mode 100644 index 000000000..f3ed0b310 --- /dev/null +++ b/tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java @@ -0,0 +1,298 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect.magika; + +import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.config.Field; +import org.apache.tika.detect.Detector; +import org.apache.tika.io.BoundedInputStream; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.ExternalProcess; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.mime.MediaType; +import org.apache.tika.utils.FileProcessResult; +import org.apache.tika.utils.ProcessUtils; +import org.apache.tika.utils.StringUtils; + +/** + * Simple wrapper around Google's magika: https://github.com/google/magika + * The tool must be installed on the host where Tika is running. + * The default behavior is to run detection, report the results in the + * metadata and then return null so that other detectors will be used. + */ +public class MagikaDetector implements Detector { + + enum STATUS { + SUCCESS, TIMEOUT, CRASH, JSON_PARSE_EXCEPTION + } + + public static final String MAGIKA_PREFIX = "magika:"; + + public static Property MAGIKA_STATUS = Property.externalText(MAGIKA_PREFIX + "status"); + public static Property MAGIKA_DESCRIPTION = + Property.externalText(MAGIKA_PREFIX + "description"); + public static Property MAGIKA_SCORE = + Property.externalReal(MAGIKA_PREFIX + "score"); + public static Property MAGIKA_GROUP = + Property.externalText(MAGIKA_PREFIX + "group"); + public static Property MAGIKA_LABEL = + Property.externalText(MAGIKA_PREFIX + "label"); + public static Property MAGIKA_MIME = + Property.externalText(MAGIKA_PREFIX + "mime_type"); + + public static Property MAGIKA_ERRORS = + Property.externalTextBag(MAGIKA_PREFIX + "errors"); + + public static Property MAGIKA_VERSION = Property.externalText(MAGIKA_PREFIX + "version"); + + //TODO -- grab errors and warnings + + private static final Logger LOGGER = LoggerFactory.getLogger(MagikaDetector.class); + private static final long DEFAULT_TIMEOUT_MS = 60000; + private static final String DEFAULT_MAGIKA_PATH = "magika"; + + //we set this during the initial check. + //we assume that a new version is not installed during the lifecycle of the MagikaDetector + private static String MAGIKA_VERSION_STRING = ""; + + private static ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static boolean HAS_WARNED = false; + private Boolean hasMagika = null; + private String magikaPath = DEFAULT_MAGIKA_PATH; + private int maxBytes = 1_000_000; + private long timeoutMs = DEFAULT_TIMEOUT_MS; + + private boolean useMime = false; + + public static boolean checkHasMagika(String magikaCommandPath) { + String[] commandline = new String[]{magikaCommandPath, "--version"}; + FileProcessResult result = null; + try { + result = ProcessUtils.execute(new ProcessBuilder(commandline), + 1000, 1000, 1000); + } catch (IOException e) { + LOGGER.debug("problem with magika: " + result.getStderr()); + return false; + } + + if (result.getExitValue() != 0) { + return false; + } + Matcher m = Pattern + .compile("Magika version:\\s+(.{4,50})").matcher(""); + for (String line : result.getStdout().split("[\r\n]+")) { + if (m.reset(line).find()) { + MAGIKA_VERSION_STRING = m.group(1); + break; + } + } + return true; + } + + /** + * @param input document input stream, or <code>null</code> + * @param metadata input metadata for the document + * @return mime as identified by the file command or application/octet-stream otherwise + * @throws IOException + */ + @Override + public MediaType detect(InputStream input, Metadata metadata) throws IOException { + if (hasMagika == null) { + hasMagika = checkHasMagika(this.magikaPath); + } + if (!hasMagika) { + if (!HAS_WARNED) { + LOGGER.warn("'magika' command isn't working: '" + magikaPath + "'"); + HAS_WARNED = true; + } + return MediaType.OCTET_STREAM; + } + TikaInputStream tis = TikaInputStream.cast(input); + if (tis != null) { + //spool the full file to disk, if called with a TikaInputStream + //and there is no underlying file + return detectOnPath(tis.getPath(), metadata); + } + + input.mark(maxBytes); + try (TemporaryResources tmp = new TemporaryResources()) { + Path tmpFile = tmp.createTempFile(); + Files.copy(new BoundedInputStream(maxBytes, input), tmpFile, REPLACE_EXISTING); + return detectOnPath(tmpFile, metadata); + } finally { + input.reset(); + } + } + + /** + * As default behavior, Tika runs magika to add its detection + * to the metadata, but NOT to use detection in determining parsers + * etc. If this is set to <code>true</code>, this detector + * will return the first mime detected by magika and that + * mime will be used by the AutoDetectParser to select the appropriate + * parser. + * + * @param useMime + */ + @Field + public void setUseMime(boolean useMime) { + this.useMime = useMime; + } + + public boolean isUseMime() { + return useMime; + } + + private MediaType detectOnPath(Path path, Metadata metadata) throws IOException { + + String[] args = new String[]{ + ProcessUtils.escapeCommandLine(magikaPath), + ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString()), + "--json" + }; + ProcessBuilder builder = new ProcessBuilder(args); + FileProcessResult result = ProcessUtils.execute(builder, timeoutMs, 10000000, 1000); + return processResult(result, metadata, useMime); + } + + protected static MediaType processResult(FileProcessResult result, Metadata metadata, + boolean returnMime) { + metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue()); + metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout()); + + if (result.isTimeout()) { + metadata.set(MAGIKA_STATUS, STATUS.TIMEOUT.name()); + return MediaType.OCTET_STREAM; + } + if (result.getExitValue() != 0) { + metadata.set(MAGIKA_STATUS, STATUS.CRASH.name()); + return MediaType.OCTET_STREAM; + } + JsonNode rootArray; + try { + rootArray = OBJECT_MAPPER.readTree(result.getStdout()); + } catch (JsonProcessingException e) { + metadata.set(MAGIKA_STATUS, STATUS.JSON_PARSE_EXCEPTION.name()); + return MediaType.OCTET_STREAM; + } + if (! rootArray.isArray() || rootArray.isEmpty()) { + //something went wrong + return MediaType.OCTET_STREAM; + } + //for now just take the first value + JsonNode root = rootArray.get(0); + metadata.set(MAGIKA_STATUS, "ok"); + //TODO -- should we get values in "dl" instead or in addition? + + if (! root.has("output")) { + //do something else + return MediaType.OCTET_STREAM; + } + JsonNode mOutput = root.get("output"); + if (mOutput.has("score")) { + double score = mOutput.get("score").asDouble(-1.0); + metadata.set(MAGIKA_SCORE, score); + } + addString(mOutput, "description", MAGIKA_DESCRIPTION, metadata); + addString(mOutput, "group", MAGIKA_GROUP, metadata); + addString(mOutput, "ct_label", MAGIKA_LABEL, metadata); + addString(mOutput, "mime_type", MAGIKA_MIME, metadata); + metadata.set(MAGIKA_VERSION, MAGIKA_VERSION_STRING); + if (returnMime && ! StringUtils.isBlank(metadata.get(MAGIKA_MIME))) { + return MediaType.parse(metadata.get(MAGIKA_MIME)); + } + + return MediaType.OCTET_STREAM; + } + + private static void setBoolean(JsonNode node, String jsonKey, Property property, + Metadata metadata) { + if (! node.has(jsonKey)) { + return; + } + if (! node.get(jsonKey).isBoolean()) { + //log? + return; + } + metadata.set(property, node.get(jsonKey).booleanValue()); + + } + + private static void addString(JsonNode node, String jsonKey, Property property, + Metadata metadata) { + if (node.has(jsonKey)) { + if (node.get(jsonKey).isArray()) { + for (JsonNode child : node.get(jsonKey)) { + String val = child + .asText(StringUtils.EMPTY); + if (! StringUtils.isBlank(val)) { + metadata.add(property, val); + } + } + } else { + String val = node + .get(jsonKey) + .asText(StringUtils.EMPTY); + if (StringUtils.isBlank(val)) { + return; + } + metadata.set(property, val); + } + } + } + + @Field + public void setMagikaPath(String fileCommandPath) { + //this opens up a potential command vulnerability. + //Don't ever let an untrusted user set this. + this.magikaPath = fileCommandPath; + checkHasMagika(this.magikaPath); + } + + /** + * If this is not called on a TikaInputStream, this detector + * will spool up to this many bytes to a file to be detected + * by the 'file' command. + * + * @param maxBytes + */ + @Field + public void setMaxBytes(int maxBytes) { + this.maxBytes = maxBytes; + } + + @Field + public void setTimeoutMs(long timeoutMs) { + this.timeoutMs = timeoutMs; + } +} diff --git a/tika-detectors/tika-detector-magika/src/main/resources/META-INF/services/org.apache.tika.detect.Detector b/tika-detectors/tika-detector-magika/src/main/resources/META-INF/services/org.apache.tika.detect.Detector new file mode 100644 index 000000000..380301a67 --- /dev/null +++ b/tika-detectors/tika-detector-magika/src/main/resources/META-INF/services/org.apache.tika.detect.Detector @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.tika.detect.magika.MagikaDetector \ No newline at end of file diff --git a/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaIntegration.java b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaIntegration.java new file mode 100644 index 000000000..f00e020fa --- /dev/null +++ b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaIntegration.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect.magika; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.net.URISyntaxException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; + +@Disabled("need to have magika on the path") +public class TestMagikaIntegration extends TikaTest { + + @Test + public void testIntegration() throws Exception { + TikaConfig tikaConfig = new TikaConfig(getConfig("tika-config.xml")); + Parser p = new AutoDetectParser(tikaConfig); + List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf", p); + debug(getRecursiveMetadata("testPDF.pdf", p)); + Metadata m = metadataList.get(0); + assertEquals("PDF document", m.get(MagikaDetector.MAGIKA_DESCRIPTION)); + assertEquals(1.0, Double.parseDouble(m.get(MagikaDetector.MAGIKA_SCORE)), 0.000001); + assertEquals("document", m.get(MagikaDetector.MAGIKA_GROUP)); + assertEquals("0.5.1", m.get(MagikaDetector.MAGIKA_VERSION)); + assertEquals("application/pdf", m.get(MagikaDetector.MAGIKA_MIME)); + assertEquals("application/pdf", m.get(Metadata.CONTENT_TYPE)); + assertEquals("ok", m.get(MagikaDetector.MAGIKA_STATUS)); + } + + private Path getConfig(String configName) throws URISyntaxException { + return Paths.get( + getClass().getResource("/configs/" + configName).toURI()); + } + +} diff --git a/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaJsonParsing.java b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaJsonParsing.java new file mode 100644 index 000000000..ef71d6656 --- /dev/null +++ b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaJsonParsing.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect.magika; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.utils.FileProcessResult; + +public class TestMagikaJsonParsing extends TikaTest { + + //TODO -- add testcontainers unit test with dockerized magika + + @Test + public void testBasic() throws Exception { + FileProcessResult fileProcessResult = load("test-basic.json"); + Metadata metadata = new Metadata(); + MagikaDetector.processResult(fileProcessResult, metadata, false); + assertEquals("ok", metadata.get(MagikaDetector.MAGIKA_STATUS)); + assertEquals("Python source", metadata.get(MagikaDetector.MAGIKA_DESCRIPTION)); + assertEquals(0.999987125396, Double.parseDouble(metadata.get(MagikaDetector.MAGIKA_SCORE)), 0.0000001); + assertEquals("code", metadata.get(MagikaDetector.MAGIKA_GROUP)); + assertEquals("python", metadata.get(MagikaDetector.MAGIKA_LABEL)); + assertEquals("text/x-python", metadata.get(MagikaDetector.MAGIKA_MIME)); + } +/* + @Test + public void testErrors() throws Exception { + FileProcessResult fileProcessResult = load("test-errors.json"); + Metadata metadata = new Metadata(); + SiegfriedDetector.processResult(fileProcessResult, metadata, false); + //debug(metadata); + assertEquals("1.9.5", metadata.get(SiegfriedDetector.SIEGFRIED_VERSION)); + assertEquals("default.sig", metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE)); + assertEquals("x-fmt/111", metadata.get("sf:pronom:id")); + assertEquals("extension match txt", metadata.get("sf:pronom:basis")); + assertEquals("Plain Text File", metadata.get("sf:pronom:format")); + assertEquals("text/plain", metadata.get("sf:pronom:mime")); + assertNull(metadata.get("sf:pronom:version")); + assertEquals("empty source", metadata.get(SiegfriedDetector.SIEGFRIED_ERRORS)); + } + + @Test + public void testWarnings() throws Exception { + FileProcessResult fileProcessResult = load("test-warnings.json"); + Metadata metadata = new Metadata(); + SiegfriedDetector.processResult(fileProcessResult, metadata, false); + assertEquals("1.9.5", metadata.get(SiegfriedDetector.SIEGFRIED_VERSION)); + assertEquals("default.sig", metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE)); + assertEquals("UNKNOWN", metadata.get("sf:pronom:id")); + assertNull(metadata.get("sf:pronom:basis")); + assertNull(metadata.get("sf:pronom:format")); + assertNull(metadata.get("sf:pronom:mime")); + assertNull(metadata.get("sf:pronom:version")); + assertTrue(metadata.get("sf:pronom:warning") + .startsWith("no match; possibilities based on extension are fmt/14, fmt/15, fmt/16, " + + "fmt/17, fmt/18, fmt/19")); + } + + +*/ + + private FileProcessResult load(String jsonFileName) throws IOException { + String jsonString = IOUtils.toString( + getClass().getResourceAsStream("/json/" + jsonFileName), StandardCharsets.UTF_8); + FileProcessResult r = new FileProcessResult(); + r.setStdout(jsonString); + r.setExitValue(0); + return r; + } +} diff --git a/tika-detectors/tika-detector-magika/src/test/resources/configs/tika-config.xml b/tika-detectors/tika-detector-magika/src/test/resources/configs/tika-config.xml new file mode 100644 index 000000000..6bb062161 --- /dev/null +++ b/tika-detectors/tika-detector-magika/src/test/resources/configs/tika-config.xml @@ -0,0 +1,28 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <detectors> + <detector class="org.apache.tika.detect.DefaultDetector"/> + <detector class="org.apache.tika.detect.magika.MagikaDetector"> + <params> + <param name="magikaPath" type="string">magika</param> <!-- or replace with full path to the commandline --> + <param name="useMime" type="bool">true</param> + </params> + </detector> + </detectors> +</properties> diff --git a/tika-detectors/tika-detector-magika/src/test/resources/json/test-basic.json b/tika-detectors/tika-detector-magika/src/test/resources/json/test-basic.json new file mode 100644 index 000000000..e8aa5366e --- /dev/null +++ b/tika-detectors/tika-detector-magika/src/test/resources/json/test-basic.json @@ -0,0 +1,21 @@ +[ + { + "path": "...im2txtapi.py", + "dl": { + "ct_label": "python", + "score": 0.9999871253967285, + "group": "code", + "mime_type": "text/x-python", + "magic": "Python script", + "description": "Python source" + }, + "output": { + "ct_label": "python", + "score": 0.9999871253967285, + "group": "code", + "mime_type": "text/x-python", + "magic": "Python script", + "description": "Python source" + } + } +] diff --git a/tika-detectors/tika-detector-magika/src/test/resources/test-documents/testPDF.pdf b/tika-detectors/tika-detector-magika/src/test/resources/test-documents/testPDF.pdf new file mode 100644 index 000000000..1f1bcff6f Binary files /dev/null and b/tika-detectors/tika-detector-magika/src/test/resources/test-documents/testPDF.pdf differ
