This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 69fd2c6b18d9eaa4b5977620bc8bf2d572cf456e
Author: tallison <[email protected]>
AuthorDate: Thu Oct 29 10:57:03 2020 -0400

    TIKA-3215 -- add a wrapper for the commandline linux file command as a detector
    
    # Conflicts:
    #   tika-parsers/src/test/java/org/apache/tika/detect/TestFileCommandDetector.java
---
 .../apache/tika/detect/FileCommandDetector.java    | 207 +++++++++++++++++++++
 .../tika/detect/FileCommandDetectorTest.java       |  44 +++++
 .../org/apache/tika/config/FileCommandDetector.xml |  22 +++
 .../tika/detect/TestFileCommandDetector.java       |  59 ++++++
 4 files changed, 332 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
new file mode 100644
index 0000000..b68991d
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.io.BoundedInputStream;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.utils.ProcessUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;
+
+/**
+ * This runs the linux 'file' command against a file.  If
+ * this is called on a TikaInputStream, it will use the underlying Path
+ * or spool the full file to disk and then run file against that.
+ *
+ * If this is run against any other type of InputStream, it will spool
+ * up to {@link #maxBytes} to disk and then run the detector.
+ *
+ * As with all detectors, mark must be supported.
+ *
+ * If the 'file' command is not available, if the input is null, or if the
+ * command's output cannot be parsed as a media type, this returns
+ * {@link MediaType#OCTET_STREAM}.
+ */
+public class FileCommandDetector implements Detector {
+
+    //TODO: file has some diff mimes names for some very common mimes
+    //should we map file mimes to Tika mimes, e.g. text/xml -> application/xml??
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(FileCommandDetector.class);
+    private static final long DEFAULT_TIMEOUT_MS = 6000;
+    private static final String DEFAULT_FILE_COMMAND_PATH = "file";
+
+    //volatile: a single detector instance may be called from several threads.
+    //A race can at worst repeat the check or the warning, which is harmless.
+    private static volatile boolean HAS_WARNED = false;
+
+    //null until the availability of the 'file' command has been checked
+    private volatile Boolean hasFileCommand = null;
+    private String fileCommandPath = DEFAULT_FILE_COMMAND_PATH;
+    private int maxBytes = 1_000_000;
+    private long timeoutMs = DEFAULT_TIMEOUT_MS;
+
+    /**
+     * Checks whether the default 'file' command can be run.
+     *
+     * @return the result of {@link #checkHasFile(String)} for the default command path
+     */
+    static boolean checkHasFile() {
+        return checkHasFile(DEFAULT_FILE_COMMAND_PATH);
+    }
+
+
+    /**
+     * Checks whether the given 'file' command can be run by
+     * invoking it with "-v" through {@link ExternalParser#check(String[])}.
+     *
+     * @param fileCommandPath command (or path to the command) to test
+     * @return whatever {@link ExternalParser#check(String[])} reports for the command
+     */
+    static boolean checkHasFile(String fileCommandPath) {
+        String[] commandline = new String[]{
+            fileCommandPath, "-v"
+        };
+        return ExternalParser.check(commandline);
+    }
+
+    /**
+     * Detects the media type by running the 'file' command.
+     *
+     * @param input stream to detect; a non-TikaInputStream must support mark/reset
+     *              and is reset before this method returns
+     * @param metadata not used by this detector
+     * @return the detected type, or {@link MediaType#OCTET_STREAM} if the command
+     *         is unavailable, the input is null or the output is unparseable
+     * @throws IOException if spooling fails, the command times out, exits with a
+     *         non-zero value, or the calling thread is interrupted
+     */
+    @Override
+    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+        Boolean available = hasFileCommand;
+        if (available == null) {
+            available = checkHasFile(this.fileCommandPath);
+            hasFileCommand = available;
+        }
+        if (!available) {
+            if (!HAS_WARNED) {
+                LOGGER.warn("'file' command isn't working: '{}'", fileCommandPath);
+                HAS_WARNED = true;
+            }
+            return MediaType.OCTET_STREAM;
+        }
+        if (input == null) {
+            //nothing to hand to the 'file' command
+            return MediaType.OCTET_STREAM;
+        }
+        TikaInputStream tis = TikaInputStream.cast(input);
+        if (tis != null) {
+            //spool the full file to disk, if called with a TikaInputStream
+            //and there is no underlying file
+            return detectOnPath(tis.getPath());
+        }
+
+        input.mark(maxBytes);
+        try {
+            try (TemporaryResources tmp = new TemporaryResources()) {
+                Path tmpFile = tmp.createTempFile();
+                Files.copy(new BoundedInputStream(maxBytes, input), tmpFile, REPLACE_EXISTING);
+                return detectOnPath(tmpFile);
+            }
+        } finally {
+            //outer finally: the stream is reset even if deleting the temp file fails
+            input.reset();
+        }
+    }
+
+    /**
+     * Runs "file -b --mime-type" on the given path and parses its standard output.
+     *
+     * @param path file to run the command against
+     * @return the parsed media type, or {@link MediaType#OCTET_STREAM} if the
+     *         output cannot be parsed
+     * @throws IOException on timeout, non-zero exit value or interruption
+     */
+    private MediaType detectOnPath(Path path) throws IOException {
+
+        String[] args = new String[]{
+                ProcessUtils.escapeCommandLine(fileCommandPath),
+                "-b", "--mime-type",
+                ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString())
+        };
+        Process process = new ProcessBuilder(args).start();
+        //drain both streams on their own threads so the process can't block on a full pipe
+        StringStreamGobbler errorGobbler = new StringStreamGobbler(process.getErrorStream());
+        StringStreamGobbler outGobbler = new StringStreamGobbler(process.getInputStream());
+        Thread errorThread = new Thread(errorGobbler);
+        Thread outThread = new Thread(outGobbler);
+        errorThread.start();
+        outThread.start();
+
+        try {
+            if (!process.waitFor(timeoutMs, TimeUnit.MILLISECONDS)) {
+                throw new IOExceptionWithCause(new TimeoutException("timed out"));
+            }
+            //join before reading the gobblers so that their output is complete
+            errorThread.join();
+            outThread.join();
+        } catch (InterruptedException e) {
+            //restore the flag for callers and don't parse partial output
+            Thread.currentThread().interrupt();
+            throw new IOExceptionWithCause(e);
+        } finally {
+            //covers both the timeout and the interrupt paths
+            if (process.isAlive()) {
+                process.destroyForcibly();
+            }
+        }
+        int exitValue = process.exitValue();
+        if (exitValue != 0) {
+            throw new IOExceptionWithCause(new RuntimeException(
+                    "bad exit value: " + exitValue + "; stderr: "
+                            + errorGobbler.toString().trim()));
+        }
+        //MediaType.parse returns null for unparseable input; a detector must not
+        MediaType mediaType = MediaType.parse(outGobbler.toString().trim());
+        return mediaType == null ? MediaType.OCTET_STREAM : mediaType;
+    }
+
+    /**
+     * Sets the command (or path to the command) to run instead of "file" and
+     * re-checks whether that command is available.
+     *
+     * @param fileCommandPath command to execute
+     */
+    @Field
+    public void setFilePath(String fileCommandPath) {
+        //this opens up a potential command vulnerability.
+        //Don't ever let an untrusted user set this.
+        this.fileCommandPath = fileCommandPath;
+        //store the result so that a value cached for the previous path isn't reused
+        this.hasFileCommand = checkHasFile(this.fileCommandPath);
+    }
+
+    /**
+     * If this is not called on a TikaInputStream, this detector
+     * will spool up to this many bytes to a file to be detected
+     * by the 'file' command.
+     *
+     * @param maxBytes maximum number of bytes to mark and spool to the temporary file
+     */
+    @Field
+    public void setMaxBytes(int maxBytes) {
+        this.maxBytes = maxBytes;
+    }
+
+    /**
+     * @param timeoutMs maximum time in milliseconds to wait for the 'file' command to finish
+     */
+    @Field
+    public void setTimeoutMs(long timeoutMs) {
+        this.timeoutMs = timeoutMs;
+    }
+
+    /**
+     * Reads a stream line by line as UTF-8 into a buffer until end of stream.
+     * Line terminators are normalized to "\n".
+     */
+    private static class StringStreamGobbler implements Runnable {
+
+        //plagiarized from org.apache.oodt's StreamGobbler
+        private final BufferedReader reader;
+        private final StringBuilder sb = new StringBuilder();
+
+        public StringStreamGobbler(InputStream is) {
+            this.reader = new BufferedReader(
+                    new InputStreamReader(new BufferedInputStream(is), UTF_8));
+        }
+
+        @Override
+        public void run() {
+            try {
+                String line;
+                while ((line = reader.readLine()) != null) {
+                    sb.append(line);
+                    sb.append("\n");
+                }
+            } catch (IOException e) {
+                //swallow ioe: keep whatever was read before the stream failed
+            }
+        }
+
+        public void stopGobblingAndDie() {
+            IOUtils.closeQuietly(reader);
+        }
+
+        /**
+         * @return everything read so far; only complete once the gobbling thread has ended
+         */
+        @Override
+        public String toString() {
+            return sb.toString();
+        }
+
+    }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java
new file mode 100644
index 0000000..b5d49ec
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java
@@ -0,0 +1,44 @@
+package org.apache.tika.detect;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.InputStream;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assume.assumeTrue;
+
+public class FileCommandDetectorTest {
+
+    private static final String TEST_DOC = "/test-documents/basic_embedded.xml";
+
+    private static Detector DETECTOR;
+
+    /**
+     * Builds the detector under test from the FileCommandDetector.xml test config.
+     */
+    @BeforeClass
+    public static void setUp() throws Exception {
+        try (InputStream configStream =
+                     TikaConfig.class.getResourceAsStream("FileCommandDetector.xml")) {
+            DETECTOR = new TikaConfig(configStream).getDetector();
+        }
+    }
+
+    /**
+     * Runs detection on the same document through a plain stream and through
+     * a TikaInputStream; skipped when the 'file' command is not available.
+     */
+    @Test
+    public void testBasic() throws Exception {
+        assumeTrue(FileCommandDetector.checkHasFile());
+
+        try (InputStream plain = getClass().getResourceAsStream(TEST_DOC)) {
+            assertDetectedTwiceAsXml(plain);
+        }
+
+        //now try with TikaInputStream
+        try (InputStream wrapped =
+                     TikaInputStream.get(getClass().getResourceAsStream(TEST_DOC))) {
+            assertDetectedTwiceAsXml(wrapped);
+        }
+    }
+
+    /**
+     * Detects on the same stream twice and expects text/xml both times.
+     */
+    private static void assertDetectedTwiceAsXml(InputStream stream) throws Exception {
+        MediaType expected = MediaType.text("xml");
+        assertEquals(expected, DETECTOR.detect(stream, new Metadata()));
+        //make sure that the detector is resetting the stream
+        assertEquals(expected, DETECTOR.detect(stream, new Metadata()));
+    }
+}
diff --git a/tika-core/src/test/resources/org/apache/tika/config/FileCommandDetector.xml b/tika-core/src/test/resources/org/apache/tika/config/FileCommandDetector.xml
new file mode 100644
index 0000000..cd8dc28
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/FileCommandDetector.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!--
+  Test configuration read by FileCommandDetectorTest through
+  TikaConfig.class.getResourceAsStream("FileCommandDetector.xml").
+  It lists FileCommandDetector as the only detector nested in DefaultDetector.
+  NOTE(review): presumably this replaces DefaultDetector's usual set of
+  detectors rather than adding to it; confirm against the TikaConfig docs.
+-->
+<properties>
+    <detectors class="org.apache.tika.detect.DefaultDetector">
+        <detector class="org.apache.tika.detect.FileCommandDetector"/>
+    </detectors>
+</properties>
\ No newline at end of file
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestFileCommandDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestFileCommandDetector.java
new file mode 100644
index 0000000..4ac6574
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestFileCommandDetector.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import org.apache.tika.MultiThreadedTikaTest;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.util.Random;
+
+import static org.junit.Assume.assumeTrue;
+
+public class TestFileCommandDetector extends MultiThreadedTikaTest {
+
+    /**
+     * Passes one shared FileCommandDetector to the inherited
+     * {@code testDetector} harness together with a filter that accepts a
+     * random sample of files; skipped when the 'file' command is not available.
+     */
+    @Test
+    public void testFileDetectorMultiThreaded() throws Exception {
+        assumeTrue(FileCommandDetector.checkHasFile());
+        Detector fileDetector = new FileCommandDetector();
+        FileFilter sampler = new FileFilter() {
+            //TODO: create proper randomized framework that will record seed, etc...
+            private final Random random = new Random();
+            //increase this to the number of files for a true smoke test
+            //for now, randomly pick 20 files.
+            int toProcess = 20;
+            int processed = 0;
+
+            @Override
+            public boolean accept(File pathname) {
+                //quota check comes first so no coin is flipped once the sample is full
+                if (processed >= toProcess || !random.nextBoolean()) {
+                    return false;
+                }
+                processed++;
+                return true;
+            }
+        };
+        int numThreads = 5;
+        XMLReaderUtils.setPoolSize(numThreads);
+
+        testDetector(fileDetector, numThreads, 20, sampler, numThreads * 3);
+    }
+}

Reply via email to