This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 69fd2c6b18d9eaa4b5977620bc8bf2d572cf456e Author: tallison <[email protected]> AuthorDate: Thu Oct 29 10:57:03 2020 -0400 TIKA-3215 -- add a wrapper for the commandline linux file command as a detector # Conflicts: # tika-parsers/src/test/java/org/apache/tika/detect/TestFileCommandDetector.java --- .../apache/tika/detect/FileCommandDetector.java | 207 +++++++++++++++++++++ .../tika/detect/FileCommandDetectorTest.java | 44 +++++ .../org/apache/tika/config/FileCommandDetector.xml | 22 +++ .../tika/detect/TestFileCommandDetector.java | 59 ++++++ 4 files changed, 332 insertions(+) diff --git a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java new file mode 100644 index 0000000..b68991d --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect; + +import org.apache.tika.config.Field; +import org.apache.tika.io.BoundedInputStream; +import org.apache.tika.io.IOExceptionWithCause; +import org.apache.tika.io.IOUtils; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.utils.ProcessUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; + +/** + * This runs the linux 'file' command against a file. If + * this is called on a TikaInputStream, it will use the underlying Path + * or spool the full file to disk and then run file against that. + * + * If this is run against any other type of InputStream, it will spool + * up to {@link #maxBytes} to disk and then run the detector. + * + * As with all detectors, mark must be supported. + */ +public class FileCommandDetector implements Detector { + + //TODO: file has some diff mimes names for some very common mimes + //should we map file mimes to Tika mimes, e.g. text/xml -> application/xml?? + + private static final Logger LOGGER = LoggerFactory.getLogger(FileCommandDetector.class); + private static boolean HAS_WARNED = false; + private static final long DEFAULT_TIMEOUT_MS = 6000; + private static String DEFAULT_FILE_COMMAND_PATH = "file"; + private Boolean hasFileCommand = null; + private String fileCommandPath = DEFAULT_FILE_COMMAND_PATH; + private int maxBytes = 1_000_000; + private long timeoutMs = DEFAULT_TIMEOUT_MS; + + static boolean checkHasFile() { + return checkHasFile(DEFAULT_FILE_COMMAND_PATH); + } + + + static boolean checkHasFile(String fileCommandPath) { + String[] commandline = new String[]{ + fileCommandPath, "-v" + }; + return ExternalParser.check(commandline); + } + + @Override + public MediaType detect(InputStream input, Metadata metadata) throws IOException { + if (hasFileCommand == null) { + hasFileCommand = checkHasFile(this.fileCommandPath); + } + if (!hasFileCommand) { + if (! HAS_WARNED) { + LOGGER.warn("'file' command isn't working: '"+fileCommandPath+"'"); + HAS_WARNED = true; + } + return MediaType.OCTET_STREAM; + } + TikaInputStream tis = TikaInputStream.cast(input); + if (tis != null) { + //spool the full file to disk, if called with a TikaInputStream + //and there is no underlying file + return detectOnPath(tis.getPath()); + } + + input.mark(maxBytes); + TemporaryResources tmp = new TemporaryResources(); + try { + Path tmpFile = tmp.createTempFile(); + Files.copy(new BoundedInputStream(maxBytes, input), tmpFile, REPLACE_EXISTING); + return detectOnPath(tmpFile); + } finally { + tmp.close(); + input.reset(); + } + } + + private MediaType detectOnPath(Path path) throws IOException { + + String[] args = new String[]{ + ProcessUtils.escapeCommandLine(fileCommandPath), + "-b", "--mime-type", + ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString()) + }; + ProcessBuilder builder = new ProcessBuilder(args); + Process process = builder.start(); + StringStreamGobbler errorGobbler = new StringStreamGobbler(process.getErrorStream()); + StringStreamGobbler outGobbler = new StringStreamGobbler(process.getInputStream()); + Thread errorThread = new Thread(errorGobbler); + Thread outThread = new Thread(outGobbler); + errorThread.start(); + outThread.start(); + + process.getErrorStream(); + process.getInputStream(); + + boolean finished = false; + try { + finished = process.waitFor(timeoutMs, TimeUnit.MILLISECONDS); + if (!finished) { + process.destroyForcibly(); + throw new IOExceptionWithCause(new TimeoutException("timed out")); + } + int exitValue = process.exitValue(); + if (exitValue != 0) { + throw new IOExceptionWithCause(new RuntimeException("bad exit value")); + } + errorThread.join(); + outThread.join(); + } catch (InterruptedException e) { + } + return MediaType.parse(outGobbler.toString().trim()); + } + + @Field + public void setFilePath(String fileCommandPath) { + //this opens up a potential command vulnerability. + //Don't ever let an untrusted user set this. + this.fileCommandPath = fileCommandPath; + checkHasFile(this.fileCommandPath); + } + + /** + * If this is not called on a TikaInputStream, this detector + * will spool up to this many bytes to a file to be detected + * by the 'file' command. + * + * @param maxBytes + */ + @Field + public void setMaxBytes(int maxBytes) { + this.maxBytes = maxBytes; + } + + @Field + public void setTimeoutMs(long timeoutMs) { + this.timeoutMs = timeoutMs; + } + + private static class StringStreamGobbler implements Runnable { + + //plagiarized from org.apache.oodt's StreamGobbler + private final BufferedReader reader; + private final StringBuilder sb = new StringBuilder(); + + public StringStreamGobbler(InputStream is) { + this.reader = new BufferedReader(new InputStreamReader(new BufferedInputStream(is), UTF_8)); + } + + @Override + public void run() { + String line = null; + try { + while ((line = reader.readLine()) != null) { + sb.append(line); + sb.append("\n"); + } + } catch (IOException e) { + //swallow ioe + } + } + + public void stopGobblingAndDie() { + IOUtils.closeQuietly(reader); + } + + @Override + public String toString() { + return sb.toString(); + } + + } +} diff --git a/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java new file mode 100644 index 0000000..b5d49ec --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java @@ -0,0 +1,44 @@ +package org.apache.tika.detect; + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.InputStream; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assume.assumeTrue; + +public class FileCommandDetectorTest { + + private static Detector DETECTOR; + + @BeforeClass + public static void setUp() throws Exception { + try (InputStream is = TikaConfig.class.getResourceAsStream("FileCommandDetector.xml")) { + DETECTOR = new TikaConfig(is).getDetector(); + } + } + + @Test + public void testBasic() throws Exception { + assumeTrue(FileCommandDetector.checkHasFile()); + + try (InputStream is = getClass().getResourceAsStream("/test-documents/basic_embedded.xml")) { + assertEquals(MediaType.text("xml"), DETECTOR.detect(is, new Metadata())); + //make sure that the detector is resetting the stream + assertEquals(MediaType.text("xml"), DETECTOR.detect(is, new Metadata())); + } + + //now try with TikaInputStream + try (InputStream is = TikaInputStream.get(getClass() + .getResourceAsStream("/test-documents/basic_embedded.xml"))) { + assertEquals(MediaType.text("xml"), DETECTOR.detect(is, new Metadata())); + //make sure that the detector is resetting the stream + assertEquals(MediaType.text("xml"), DETECTOR.detect(is, new Metadata())); + } + } +} diff --git a/tika-core/src/test/resources/org/apache/tika/config/FileCommandDetector.xml b/tika-core/src/test/resources/org/apache/tika/config/FileCommandDetector.xml new file mode 100644 index 0000000..cd8dc28 --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/FileCommandDetector.xml @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <detectors class="org.apache.tika.detect.DefaultDetector"> + <detector class="org.apache.tika.detect.FileCommandDetector"/> + </detectors> +</properties> \ No newline at end of file diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestFileCommandDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestFileCommandDetector.java new file mode 100644 index 0000000..4ac6574 --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestFileCommandDetector.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect; + +import org.apache.tika.MultiThreadedTikaTest; +import org.apache.tika.utils.XMLReaderUtils; +import org.junit.Test; + +import java.io.File; +import java.io.FileFilter; +import java.util.Random; + +import static org.junit.Assume.assumeTrue; + +public class TestFileCommandDetector extends MultiThreadedTikaTest { + + @Test + public void testFileDetectorMultiThreaded() throws Exception { + assumeTrue(FileCommandDetector.checkHasFile()); + Detector detector = new FileCommandDetector(); + FileFilter filter = new FileFilter() { + //TODO: create proper randomized framework that will record seed, etc... + private final Random random = new Random(); + //increase this to the number of files for a true smoke test + //for now, randomly pick 20 files. + int toProcess = 20; + int processed = 0; + @Override + public boolean accept(File pathname) { + if (processed >= toProcess) { + return false; + } else if (random.nextBoolean()) { + processed++; + return true; + } + return false; + } + }; + int numThreads = 5; + XMLReaderUtils.setPoolSize(numThreads); + + testDetector(detector, numThreads, 20, filter, numThreads*3); + + } +}
