This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 4cf115c0e2 TIKA-4716 (#2755)
4cf115c0e2 is described below
commit 4cf115c0e22e758447f23488ac5a0c1ffd46acb4
Author: Tim Allison <[email protected]>
AuthorDate: Thu Apr 9 13:25:10 2026 -0400
TIKA-4716 (#2755)
---
.github/workflows/main-jdk17-build.yml | 2 +
CHANGES.txt | 3 +
.../ROOT/examples/external-parser-exiftool.json | 1 +
.../ROOT/examples/external-parser-ffmpeg.json | 1 +
.../ROOT/examples/external-parser-multi.json | 1 +
.../modules/ROOT/examples/external-parser-sox.json | 1 +
docs/modules/ROOT/nav.adoc | 1 +
.../configuration/parsers/external-parser.adoc | 176 +++++++
docs/modules/ROOT/pages/security.adoc | 18 +-
.../services/org.apache.tika.parser.Parser | 16 -
.../apache/tika/detect/FileCommandDetector.java | 3 +-
.../org/apache/tika/embedder/ExternalEmbedder.java | 13 +-
.../parser/external/CompositeExternalParser.java | 44 --
.../tika/parser/external/ExternalParser.java | 581 +++++++--------------
.../ExternalParserConfig.java | 69 ++-
.../external/ExternalParsersConfigReader.java | 223 --------
.../ExternalParsersConfigReaderMetKeys.java | 43 --
.../parser/external/ExternalParsersFactory.java | 67 ---
.../apache/tika/parser/external/package-info.java | 22 -
.../tika/parser/external2/ExternalParser.java | 227 --------
.../java/org/apache/tika/utils/ProcessUtils.java | 74 +++
.../java/org/apache/tika/utils/StreamGobbler.java | 52 +-
.../tika/parser/external/tika-external-parsers.xml | 117 -----
.../tika/detect/siegfried/SiegfriedDetector.java | 3 +-
.../org/apache/tika/parser/gdal/GDALParser.java | 5 +-
.../apache/tika/parser/gdal/TestGDALParser.java | 6 +-
.../parser/scientific/integration/TestParsers.java | 11 -
.../apache/tika/parser/dwg/DWGParserConfig.java | 4 +-
.../org/apache/tika/parser/dwg/DWGParserTest.java | 4 +-
.../apache/tika/parser/ocr/TesseractOCRParser.java | 6 +-
.../renderer/pdf/poppler/PopplerRendererTest.java | 4 +-
.../apache/tika/parser/pkg/UnrarParserTest.java | 4 +-
.../apache/tika/parser/strings/StringsParser.java | 6 +-
.../tika/parser/strings/StringsParserTest.java | 4 +-
.../services/org.apache.tika.parser.Parser | 16 -
.../apache/tika/parser/AutoDetectParserTest.java | 6 -
.../org/apache/tika/parser/pdf/PDFParserTest.java | 4 +-
.../apache/tika/parser/pkg/UnrarParserTest.java | 4 +-
.../tika/parser/external/ExternalParserTest.java | 195 +++++++
.../tika/parser/external2/ExternalParserTest.java | 100 ----
.../configs/TIKA-3557-exiftool-example.json | 6 +-
.../src/test/resources/configs/TIKA-3557.json | 2 +-
...-example.json => external-parser-exiftool.json} | 15 +-
.../resources/configs/external-parser-ffmpeg.json | 35 ++
.../resources/configs/external-parser-multi.json | 47 ++
.../resources/configs/external-parser-sox.json | 37 ++
.../services/org.apache.tika.parser.Parser | 16 -
.../tika/server/standard/TikaParsersTest.java | 4 +-
48 files changed, 952 insertions(+), 1347 deletions(-)
diff --git a/.github/workflows/main-jdk17-build.yml
b/.github/workflows/main-jdk17-build.yml
index 75c923c10e..3f60fd98a6 100644
--- a/.github/workflows/main-jdk17-build.yml
+++ b/.github/workflows/main-jdk17-build.yml
@@ -43,6 +43,8 @@ jobs:
distribution: 'temurin'
java-version: ${{ matrix.java }}
cache: 'maven'
+ - name: Install external tools
+ run: sudo apt-get update && sudo apt-get install -y ffmpeg libimage-exiftool-perl
- name: Build with Maven
run: mvn clean apache-rat:check test install javadoc:aggregate -Pci -B
"-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn"
diff --git a/CHANGES.txt b/CHANGES.txt
index b135cb3894..ada77669ba 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -29,6 +29,9 @@ Release 4.0.0-BETA1 - ???
* Removed DigestingParser (TIKA-4607).
+ * Removed legacy ExternalParser; external parsers now require explicit
+ JSON configuration (TIKA-4707).
+
OTHER CHANGES
* Fix concurrency bug in TikaToXMP (TIKA-4393)
diff --git a/docs/modules/ROOT/examples/external-parser-exiftool.json
b/docs/modules/ROOT/examples/external-parser-exiftool.json
new file mode 120000
index 0000000000..145dcb2515
--- /dev/null
+++ b/docs/modules/ROOT/examples/external-parser-exiftool.json
@@ -0,0 +1 @@
+../../../../tika-serialization/src/test/resources/configs/external-parser-exiftool.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/external-parser-ffmpeg.json
b/docs/modules/ROOT/examples/external-parser-ffmpeg.json
new file mode 120000
index 0000000000..024b6de0af
--- /dev/null
+++ b/docs/modules/ROOT/examples/external-parser-ffmpeg.json
@@ -0,0 +1 @@
+../../../../tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/external-parser-multi.json
b/docs/modules/ROOT/examples/external-parser-multi.json
new file mode 120000
index 0000000000..9fd360037c
--- /dev/null
+++ b/docs/modules/ROOT/examples/external-parser-multi.json
@@ -0,0 +1 @@
+../../../../tika-serialization/src/test/resources/configs/external-parser-multi.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/external-parser-sox.json
b/docs/modules/ROOT/examples/external-parser-sox.json
new file mode 120000
index 0000000000..1c996f4169
--- /dev/null
+++ b/docs/modules/ROOT/examples/external-parser-sox.json
@@ -0,0 +1 @@
+../../../../tika-serialization/src/test/resources/configs/external-parser-sox.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index 1702591425..ea3e9726a9 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -28,6 +28,7 @@
** xref:configuration/parsers/pdf-parser.adoc[PDF Parser]
** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR]
** xref:configuration/parsers/vlm-parsers.adoc[VLM Parsers (Claude, Gemini,
OpenAI)]
+** xref:configuration/parsers/external-parser.adoc[External Parser (ffmpeg, exiftool, etc.)]
** xref:configuration/parsers/tess4j-parser.adoc[Tess4J OCR (In-Process)]
* xref:migration-to-4x/index.adoc[Migration to 4.x]
** xref:migration-to-4x/migrating-to-4x.adoc[Migration Guide]
diff --git a/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc
b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc
new file mode 100644
index 0000000000..281bd6d12d
--- /dev/null
+++ b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc
@@ -0,0 +1,176 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= External Parser Configuration
+
+The `ExternalParser` allows Tika to delegate parsing to external command-line
+programs such as `ffmpeg`, `exiftool`, or `sox`. Each external parser is
+configured via JSON and must be explicitly enabled -- Tika 4.x does not
+auto-discover external tools at startup.
+
+== Key Concepts
+
+=== Lazy Check
+
+Each external parser can declare a `checkCommandLine` that verifies the tool
+is installed. The check runs lazily on first use (not at startup), and if the
+tool is not found, the parser silently disables itself.
+
+=== Stream Handlers
+
+An external process produces up to three output streams. Each can have an
+independent handler (any Tika parser):
+
+* **`stdoutHandler`** -- processes stdout
+* **`stderrHandler`** -- processes stderr
+* **`outputFileHandler`** -- processes the output file (when `${OUTPUT_FILE}` is used)
+
+Handlers extract metadata, content, or both. `regex-capture-parser` is the
+most common choice for extracting metadata via regex patterns.
+
+=== Content Source
+
+The `contentSource` field controls which stream provides the XHTML text content:
+
+* `"stdout"` -- default when no `${OUTPUT_FILE}` in the command
+* `"outputFile"` -- default when `${OUTPUT_FILE}` is in the command
+* `"stderr"` -- use stderr as the content source
+* `"none"` -- metadata-only mode, no text content extracted
+
+When a handler is configured for the content source stream, its
+ContentHandler output becomes the XHTML content. When no handler is
+configured, the raw bytes are written as text.
+
+== Configuration Options
+
+[cols="1,1,3"]
+|===
+|Field |Type |Description
+
+|`commandLine`
+|`List<String>`
+|The command and arguments to run. Use `${INPUT_FILE}` and `${OUTPUT_FILE}` tokens for file paths.
+
+|`checkCommandLine`
+|`List<String>`
+|Optional. Command to verify the tool is installed (e.g., `["ffmpeg", "-version"]`).
+
+|`checkErrorCodes`
+|`List<Integer>`
+|Exit codes that indicate the tool is not available. Default: `[127]`.
+
+|`stdoutHandler`
+|Parser config
+|Optional. Parser to process stdout.
+
+|`stderrHandler`
+|Parser config
+|Optional. Parser to process stderr.
+
+|`outputFileHandler`
+|Parser config
+|Optional. Parser to process the output file.
+
+|`contentSource`
+|`String`
+|Which stream provides XHTML content: `"stdout"`, `"stderr"`, `"outputFile"`, or `"none"`. Defaults to `"outputFile"` when the command contains `${OUTPUT_FILE}`, otherwise `"stdout"`.
+
+|`returnStdout`
+|`boolean`
+|Store raw stdout in metadata. Default: `false`.
+
+|`returnStderr`
+|`boolean`
+|Store raw stderr in metadata. Default: `false`.
+
+|`timeoutMs`
+|`long`
+|Process timeout in milliseconds. Default: `60000`.
+
+|`maxStdOut`
+|`int`
+|Maximum stdout bytes to capture. Default: `10000`.
+
+|`maxStdErr`
+|`int`
+|Maximum stderr bytes to capture. Default: `10000`.
+|===
+
+== Examples
+
+=== Exiftool (metadata from stdout)
+
+Extracts metadata from media files using `exiftool`. The `stdoutHandler` uses
+`regex-capture-parser` to extract key-value pairs from exiftool's stdout.
+
+[source,json]
+----
+include::example$external-parser-exiftool.json[]
+----
+icon:github[] https://github.com/apache/tika/blob/main/tika-serialization/src/test/resources/configs/external-parser-exiftool.json[View source on GitHub]
+
+=== FFmpeg (metadata from stderr)
+
+Extracts audio/video metadata from `ffmpeg -i` output. FFmpeg writes metadata
+to stderr, so this uses `stderrHandler`.
+
+[source,json]
+----
+include::example$external-parser-ffmpeg.json[]
+----
+icon:github[] https://github.com/apache/tika/blob/main/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json[View source on GitHub]
+
+=== Sox (audio metadata from stderr)
+
+Extracts audio metadata using `sox --info`. Like FFmpeg, Sox writes to stderr.
+
+[source,json]
+----
+include::example$external-parser-sox.json[]
+----
+icon:github[] https://github.com/apache/tika/blob/main/tika-serialization/src/test/resources/configs/external-parser-sox.json[View source on GitHub]
+
+=== Multiple External Parsers
+
+You can configure multiple external parsers in a single config file. Each
+handles different MIME types via `_mime-include`. Here FFmpeg handles video
+files while exiftool handles PDFs:
+
+[source,json]
+----
+include::example$external-parser-multi.json[]
+----
+icon:github[] https://github.com/apache/tika/blob/main/tika-serialization/src/test/resources/configs/external-parser-multi.json[View source on GitHub]
+
+== Changes from 3.x
+
+In Tika 3.x, external parsers were configured via XML (`tika-external-parsers.xml`)
+and auto-discovered at startup. The `CompositeExternalParser` would fork
+a process for each configured tool on every Tika initialization to check
+if the tool was available.
+
+In Tika 4.x:
+
+* External parsers must be explicitly configured in JSON -- no auto-discovery.
+* The `checkCommandLine` runs lazily on first use, not at startup.
+* Three independent stream handlers (`stdoutHandler`, `stderrHandler`,
+ `outputFileHandler`) replace the old `outputParser`/`stderrParser` split.
+* The `contentSource` field explicitly controls which stream provides text content.
+* `CompositeExternalParser`, `ExternalParsersFactory`, and the XML config
+ reader have been removed.
+
+See xref:migration-to-4x/migrating-to-4x.adoc[Migrating to 4.x] for general migration guidance.
diff --git a/docs/modules/ROOT/pages/security.adoc
b/docs/modules/ROOT/pages/security.adoc
index 0357714508..3524da1521 100644
--- a/docs/modules/ROOT/pages/security.adoc
+++ b/docs/modules/ROOT/pages/security.adoc
@@ -55,10 +55,26 @@ Never allow untrusted users to configure command paths or
arguments.
4. **Audit command configuration**: Regularly review configured external
commands and
their arguments.
+=== ExternalParser-Specific Risks
+
+* **checkCommandLine runs at type-query time**: If configured, the check command
+ executes the first time `getSupportedTypes()` is called -- not at parse time.
+ This means merely querying which parsers are available can trigger process execution.
+
+* **stderr information leakage**: External programs often write file paths, system
+ usernames, version strings, and internal errors to stderr. By default,
+ `returnStderr` is `false` to prevent this data from leaking into metadata.
+ If you enable `returnStderr`, be aware that the raw stderr content will be
+ stored in the document's metadata and may be visible to end users.
+
+* **Buffer limits**: The `maxStdOut` and `maxStdErr` settings control how much
+ process output is captured in memory. Set these to reasonable values for your
+ deployment to prevent memory exhaustion from misbehaving external programs.
+
=== Affected Components
* `FileCommandDetector`: Uses the system `file` command for MIME type detection
-* `ExternalParser`: Executes arbitrary external programs to extract content
+* `ExternalParser`: Executes configured external programs to extract content
* `ExternalEmbedder`: Uses external tools to embed content
== Credential Handling
diff --git
a/tika-app/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
b/tika-app/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
deleted file mode 100644
index 37f87a4595..0000000000
---
a/tika-app/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-org.apache.tika.parser.external.CompositeExternalParser
\ No newline at end of file
diff --git
a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
index 83182eafc4..1433bc73ef 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
@@ -29,7 +29,6 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.StringUtils;
@@ -74,7 +73,7 @@ public class FileCommandDetector implements Detector {
public static boolean checkHasFile(String fileCommandPath) {
String[] commandline = new String[]{fileCommandPath, "-v"};
- return ExternalParser.check(commandline);
+ return ProcessUtils.checkCommand(commandline);
}
/**
diff --git
a/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
b/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
index c58d57345f..4cdfbea850 100644
--- a/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
+++ b/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
@@ -40,7 +40,6 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.external.ExternalParser;
/**
* Embedder that uses an external program (like sed or exiftool) to embed text
@@ -50,6 +49,8 @@ import org.apache.tika.parser.external.ExternalParser;
*/
public class ExternalEmbedder implements Embedder {
+ public static final String INPUT_FILE_TOKEN = "${INPUT}";
+ public static final String OUTPUT_FILE_TOKEN = "${OUTPUT}";
/**
* Token to be replaced with a String array of metadata assignment command
* arguments
@@ -78,7 +79,7 @@ public class ExternalEmbedder implements Embedder {
*/
private String[] command =
new String[]{"sed", "-e", "$a\\\n" +
METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
- ExternalParser.INPUT_FILE_TOKEN};
+ INPUT_FILE_TOKEN};
private String commandAssignmentOperator = "=";
private String commandAssignmentDelimeter = ", ";
private String commandAppendOperator = "=";
@@ -355,15 +356,15 @@ public class ExternalEmbedder implements Embedder {
String[] origCmd = command;
List<String> cmd = new ArrayList<>();
for (String commandSegment : origCmd) {
- if (commandSegment.contains(ExternalParser.INPUT_FILE_TOKEN)) {
- commandSegment =
commandSegment.replace(ExternalParser.INPUT_FILE_TOKEN,
+ if (commandSegment.contains(INPUT_FILE_TOKEN)) {
+ commandSegment = commandSegment.replace(INPUT_FILE_TOKEN,
tikaInputStream.getFile().toString());
inputToStdIn = false;
}
- if (commandSegment.contains(ExternalParser.OUTPUT_FILE_TOKEN)) {
+ if (commandSegment.contains(OUTPUT_FILE_TOKEN)) {
tempOutputFile = tmp.createTemporaryFile();
commandSegment = commandSegment
- .replace(ExternalParser.OUTPUT_FILE_TOKEN,
tempOutputFile.toString());
+ .replace(OUTPUT_FILE_TOKEN, tempOutputFile.toString());
outputFromStdOut = false;
}
if
(commandSegment.contains(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN)) {
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java
b/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java
deleted file mode 100644
index 53cb7b7eac..0000000000
---
a/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.external;
-
-import java.io.IOException;
-import java.util.List;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.mime.MediaTypeRegistry;
-import org.apache.tika.parser.CompositeParser;
-import org.apache.tika.parser.Parser;
-
-/**
- * A Composite Parser that wraps up all the available External Parsers,
- * and provides an easy way to access them.
- * Parser that uses an external program (like catdoc or pdf2txt) to extract
- * text content and metadata from a given document.
- */
-public class CompositeExternalParser extends CompositeParser {
- private static final long serialVersionUID = 6962436916649024024L;
-
- public CompositeExternalParser() throws IOException, TikaException {
- this(new MediaTypeRegistry());
- }
-
- @SuppressWarnings("unchecked")
- public CompositeExternalParser(MediaTypeRegistry registry) throws
IOException, TikaException {
- super(registry, (List<Parser>) (List<? extends Parser>)
ExternalParsersFactory.create());
- }
-}
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
index 0e17384928..efd3856cba 100644
---
a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
+++
b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
@@ -16,461 +16,284 @@
*/
package org.apache.tika.parser.external;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStream;
-import java.io.Reader;
-import java.io.Serializable;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
-import java.util.Map;
+import java.util.List;
import java.util.Set;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.TimeoutException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.output.NullOutputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.apache.tika.config.ConfigDeserializer;
+import org.apache.tika.config.JsonConfig;
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.config.TikaProgressTracker;
+import org.apache.tika.config.TimeoutLimits;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.ExternalProcess;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.FileProcessResult;
+import org.apache.tika.utils.ProcessUtils;
/**
- * Parser that uses an external program (like catdoc or pdf2txt) to extract
- * text content and metadata from a given document.
- *
- * @deprecated Use {@link org.apache.tika.parser.external2.ExternalParser}
instead.
- * This class will be removed in a future version of Tika.
+ * Parser that uses an external program (like ffmpeg, exiftool or sox)
+ * to extract text content and metadata from a given document.
+ * <p>
+ * This parser relies on JSON configuration rather than classpath
auto-discovery.
+ * Users can specify independent handlers for each process stream:
+ * <ul>
+ * <li>{@code stdoutHandler} — processes stdout</li>
+ * <li>{@code stderrHandler} — processes stderr</li>
+ * <li>{@code outputFileHandler} — processes the output file</li>
+ * </ul>
+ * The {@code contentSource} field controls which stream provides the XHTML
+ * content output. An optional {@code checkCommandLine} lazily verifies the
+ * external tool is available.
*/
-@Deprecated
+@TikaComponent
public class ExternalParser implements Parser {
- private static final Logger LOG =
LoggerFactory.getLogger(ExternalParser.class);
+ public static final long DEFAULT_TIMEOUT_MS = 60000;
- /**
- * The token, which if present in the Command string, will
- * be replaced with the input filename.
- * Alternately, the input data can be streamed over STDIN.
- */
- public static final String INPUT_FILE_TOKEN = "${INPUT}";
- /**
- * The token, which if present in the Command string, will
- * be replaced with the output filename.
- * Alternately, the output data can be collected on STDOUT.
- */
- public static final String OUTPUT_FILE_TOKEN = "${OUTPUT}";
- private static final long serialVersionUID = -1079128990650687037L;
- //make this parameterizable
- private final long timeoutMs = 60000;
- /**
- * Media types supported by the external program.
- */
- private Set<MediaType> supportedTypes = Collections.emptySet();
-
- /**
- * Regular Expressions to run over STDOUT to
- * extract Metadata.
- */
- private Map<Pattern, String> metadataPatterns = null;
- /**
- * The external command to invoke.
- *
- * @see Runtime#exec(String[])
- */
- private String[] command = new String[]{"cat"};
- /**
- * A consumer for ignored Lines
- */
- private LineConsumer ignoredLineConsumer = LineConsumer.NULL;
-
- /**
- * Starts a thread that reads and discards the contents of the
- * standard stream of the given process. Potential exceptions
- * are ignored, and the stream is closed once fully processed.
- * Note: calling this starts a new thread and blocks the current(caller)
- * thread until the new thread dies
- *
- * @param stream stream to be ignored
- */
- private static void ignoreStream(final InputStream stream) {
- ignoreStream(stream, true);
- }
-
- /**
- * Starts a thread that reads and discards the contents of the
- * standard stream of the given process. Potential exceptions
- * are ignored, and the stream is closed once fully processed.
- *
- * @param stream stream to sent to black hole (a k a null)
- * @param waitForDeath when {@code true} the caller thread will be
- * blocked till the death of new thread.
- * @return The thread that is created and started
- */
- private static Thread ignoreStream(final InputStream stream, boolean
waitForDeath) {
- Thread t = new Thread(() -> {
- try {
- IOUtils.copy(stream, NullOutputStream.INSTANCE);
- } catch (IOException e) {
- //swallow
- } finally {
- IOUtils.closeQuietly(stream);
- }
- });
- t.start();
- if (waitForDeath) {
- try {
- t.join();
- } catch (InterruptedException ignore) {
- }
- }
- return t;
- }
+ public static final String INPUT_FILE_TOKEN = "${INPUT_FILE}";
- /**
- * Checks to see if the command can be run. Typically used with
- * something like "myapp --version" to check to see if "myapp"
- * is installed and on the path.
- *
- * @param checkCmd The check command to run
- * @param errorValue What is considered an error value?
- */
- public static boolean check(String checkCmd, int... errorValue) {
- return check(new String[]{checkCmd}, errorValue);
- }
+ public static final String OUTPUT_FILE_TOKEN = "${OUTPUT_FILE}";
- public static boolean check(String[] checkCmd, int... errorValue) {
- if (errorValue.length == 0) {
- errorValue = new int[]{127};
- }
+ private static final Pattern INPUT_TOKEN_MATCHER =
+ Pattern.compile("\\$\\{INPUT_FILE}");
+ private static final Pattern OUTPUT_TOKEN_MATCHER =
+ Pattern.compile("\\$\\{OUTPUT_FILE}");
- Process process = null;
- try {
- process = Runtime.getRuntime().exec(checkCmd);
- Thread stdErrSuckerThread = ignoreStream(process.getErrorStream(),
false);
- Thread stdOutSuckerThread = ignoreStream(process.getInputStream(),
false);
- stdErrSuckerThread.join();
- stdOutSuckerThread.join();
- //make the timeout parameterizable
- boolean finished = process.waitFor(60000, TimeUnit.MILLISECONDS);
- if (!finished) {
- throw new TimeoutException();
- }
- int result = process.exitValue();
- LOG.debug("exit value for {}: {}", checkCmd[0], result);
- for (int err : errorValue) {
- if (result == err) {
- return false;
- }
- }
- return true;
- } catch (IOException | InterruptedException | TimeoutException e) {
- LOG.debug("exception trying to run " + checkCmd[0], e);
- // Some problem, command is there or is broken
- return false;
- } catch (SecurityException se) {
- // External process execution is banned by the security manager
- throw se;
- } catch (Error err) {
- if (err.getMessage() != null &&
(err.getMessage().contains("posix_spawn") ||
- err.getMessage().contains("UNIXProcess"))) {
- LOG.debug("(TIKA-1526): exception trying to run: " +
checkCmd[0], err);
- //"Error forking command due to JVM locale bug
- //(see TIKA-1526 and SOLR-6387)"
- return false;
- }
- //throw if a different kind of error
- throw err;
- } finally {
- if (process != null) {
- process.destroyForcibly();
- }
- }
- }
+ private static final Logger LOG =
LoggerFactory.getLogger(ExternalParser.class);
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return getSupportedTypes();
- }
+ private static final ContentHandler DISCARD_HANDLER =
+ new org.xml.sax.helpers.DefaultHandler();
- public Set<MediaType> getSupportedTypes() {
- return supportedTypes;
- }
+ private final ExternalParserConfig config;
- public void setSupportedTypes(Set<MediaType> supportedTypes) {
- this.supportedTypes = Collections.unmodifiableSet(new
HashSet<>(supportedTypes));
- }
+ // Cached values derived from config
+ private final Set<MediaType> supportedTypes;
+ private final List<String> commandLine;
+ private final Parser stdoutHandler;
+ private final Parser stderrHandler;
+ private final Parser outputFileHandler;
- public String[] getCommand() {
- return command;
- }
+ // Lazy check state
+ private final String[] checkCmd;
+ private final int[] checkErrorCodes;
+ private volatile Boolean checkResult;
/**
- * Sets the command to be run. This can include either of
- * {@link #INPUT_FILE_TOKEN} or {@link #OUTPUT_FILE_TOKEN}
- * if the command needs filenames.
- *
- * @see Runtime#exec(String[])
+ * Default constructor - not typically useful since ExternalParser
requires configuration.
*/
- public void setCommand(String... command) {
- this.command = command;
+ public ExternalParser() {
+ this(new ExternalParserConfig());
}
/**
- * Gets lines consumer
- *
- * @return consumer instance
+ * Programmatic constructor with typed config.
*/
- public LineConsumer getIgnoredLineConsumer() {
- return ignoredLineConsumer;
+ public ExternalParser(ExternalParserConfig config) {
+ this.config = config;
+ this.supportedTypes = new HashSet<>();
+ for (String s : config.getSupportedTypes()) {
+ this.supportedTypes.add(MediaType.parse(s));
+ }
+ this.commandLine = new ArrayList<>(config.getCommandLine());
+ this.stdoutHandler = config.getStdoutHandler();
+ this.stderrHandler = config.getStderrHandler();
+ this.outputFileHandler = config.getOutputFileHandler();
+
+ // Set up lazy check
+ if (config.getCheckCommandLine() != null &&
!config.getCheckCommandLine().isEmpty()) {
+ this.checkCmd = config.getCheckCommandLine().toArray(new
String[0]);
+ if (config.getCheckErrorCodes() != null &&
+ !config.getCheckErrorCodes().isEmpty()) {
+ this.checkErrorCodes = config.getCheckErrorCodes().stream()
+ .mapToInt(Integer::intValue).toArray();
+ } else {
+ this.checkErrorCodes = new int[]{127};
+ }
+ this.checkResult = null; // will be lazily evaluated
+ } else {
+ this.checkCmd = null;
+ this.checkErrorCodes = null;
+ this.checkResult = Boolean.TRUE; // no check configured, always
available
+ }
}
/**
- * Set a consumer for the lines ignored by the parse functions
- *
- * @param ignoredLineConsumer consumer instance
+ * JSON config constructor - used for deserialization.
*/
- public void setIgnoredLineConsumer(LineConsumer ignoredLineConsumer) {
- this.ignoredLineConsumer = ignoredLineConsumer;
- }
-
- public Map<Pattern, String> getMetadataExtractionPatterns() {
- return metadataPatterns;
+ public ExternalParser(JsonConfig jsonConfig) {
+ this(ConfigDeserializer.buildConfig(jsonConfig,
ExternalParserConfig.class));
}
- /**
- * Sets the map of regular expression patterns and Metadata
- * keys. Any matching patterns will have the matching
- * metadata entries set.
- * Set this to null to disable Metadata extraction.
- */
- public void setMetadataExtractionPatterns(Map<Pattern, String> patterns) {
- this.metadataPatterns = patterns;
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ if (checkResult == null) {
+ synchronized (this) {
+ if (checkResult == null) {
+ checkResult = ProcessUtils.checkCommand(checkCmd,
checkErrorCodes);
+ }
+ }
+ }
+ return checkResult ? supportedTypes : Collections.emptySet();
}
- /**
- * Executes the configured external command and passes the given document
- * stream as a simple XHTML document to the given SAX content handler.
- * Metadata is only extracted if {@link
#setMetadataExtractionPatterns(Map)}
- * has been called to set patterns.
- */
+ /**
+ * Runs the configured command line against the input file and routes the
+ * process output to the configured handlers.
+ * <p>
+ * In each command-line argument {@code ${INPUT_FILE}} is replaced with the
+ * absolute path of the input and {@code ${OUTPUT_FILE}} with the absolute
+ * path of a temporary file; both tokens may appear in the same argument,
+ * and every {@code ${OUTPUT_FILE}} occurrence refers to the same file.
+ * <p>
+ * Exit value, timeout flag and the stdout/stderr lengths and truncation
+ * flags are always written to the metadata; the stdout/stderr text itself
+ * only when the config asks for it. The content source ({@code "stdout"},
+ * {@code "stderr"} or {@code "outputFile"}) decides which stream is
+ * written to the XHTML body; when the config leaves it {@code null} it is
+ * {@code "outputFile"} if the command uses {@code ${OUTPUT_FILE}} and
+ * {@code "stdout"} otherwise.
+ *
+ * @param tis input; its file path is handed to the external command
+ * @param handler receives the XHTML output
+ * @param metadata receives the process metadata and whatever the stream
+ * handlers add
+ * @param context parse context, also consulted for the process timeout
+ */
+ @Override
public void parse(TikaInputStream tis, ContentHandler handler, Metadata
metadata,
ParseContext context) throws IOException, SAXException,
TikaException {
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata,
context);
-
- TemporaryResources tmp = new TemporaryResources();
- try {
- parse(tis, xhtml, metadata, tmp);
- } finally {
- tmp.dispose();
- }
- }
-
- private void parse(TikaInputStream tis, XHTMLContentHandler xhtml,
Metadata metadata,
- TemporaryResources tmp) throws IOException,
SAXException, TikaException {
- boolean inputToStdIn = true;
- boolean outputFromStdOut = true;
- boolean hasPatterns = (metadataPatterns != null &&
!metadataPatterns.isEmpty());
-
- File output = null;
-
- // Build our command
- String[] cmd;
- if (command.length == 1) {
- cmd = command[0].split(" ");
- } else {
- cmd = new String[command.length];
- System.arraycopy(command, 0, cmd, 0, command.length);
- }
- for (int i = 0; i < cmd.length; i++) {
- if (cmd[i].contains(INPUT_FILE_TOKEN)) {
- cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN,
tis.getFile().getPath());
- inputToStdIn = false;
- }
- if (cmd[i].contains(OUTPUT_FILE_TOKEN)) {
- output = tmp.createTemporaryFile();
- outputFromStdOut = false;
- cmd[i] = cmd[i].replace(OUTPUT_FILE_TOKEN, output.getPath());
+ try (TemporaryResources tmp = new TemporaryResources()) {
+ Path outFile = null;
+ Path p = tis.getPath();
+ List<String> thisCommandLine = new ArrayList<>();
+ Matcher inputMatcher = INPUT_TOKEN_MATCHER.matcher("");
+ Matcher outputMatcher = OUTPUT_TOKEN_MATCHER.matcher("");
+ boolean hasOutputFile = false;
+ for (String c : commandLine) {
+ // An argument may embed either token or both, so the two
+ // substitutions are applied independently of each other.
+ String updated = c;
+ if (outputMatcher.reset(c).find()) {
+ // Every ${OUTPUT_FILE} occurrence names the same temp file,
+ // which is the one read back after the process exits.
+ if (outFile == null) {
+ outFile = tmp.createTempFile();
+ }
+ updated = updated.replace(OUTPUT_FILE_TOKEN,
+ outFile.toAbsolutePath().toString());
+ hasOutputFile = true;
+ }
+ if (inputMatcher.reset(c).find()) {
+ // ProcessBuilder uses argv arrays, not shell invocation,
+ // so no escaping is needed or desired here.
+ // Substituted last so the input path itself is never
+ // rescanned for tokens.
+ updated = updated.replace(INPUT_FILE_TOKEN,
+ p.toAbsolutePath().toString());
+ }
+ thisCommandLine.add(updated);
}
- }
- // Execute
- Process process = null;
- try {
- if (cmd.length == 1) {
- process = Runtime.getRuntime().exec(cmd[0]);
- } else {
- process = Runtime.getRuntime().exec(cmd);
+ // Always capture both stdout and stderr in memory
+ long localTimeoutMillis = TimeoutLimits.getProcessTimeoutMillis(
+ context, config.getTimeoutMs());
+ FileProcessResult result = ProcessUtils.execute(
+ new ProcessBuilder(thisCommandLine),
+ localTimeoutMillis, config.getMaxStdOut(),
config.getMaxStdErr());
+
+ // Set process metadata
+ metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout());
+ metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue());
+ TikaProgressTracker.update(context);
+ metadata.set(ExternalProcess.STD_OUT_LENGTH,
result.getStdoutLength());
+ metadata.set(ExternalProcess.STD_OUT_IS_TRUNCATED,
+ result.isStdoutTruncated());
+ metadata.set(ExternalProcess.STD_ERR_LENGTH,
result.getStderrLength());
+ metadata.set(ExternalProcess.STD_ERR_IS_TRUNCATED,
+ result.isStderrTruncated());
+
+ if (config.isReturnStdout()) {
+ metadata.set(ExternalProcess.STD_OUT, result.getStdout());
}
- } catch (Exception e) {
- LOG.warn("problem with process exec", e);
- }
-
- try {
- if (inputToStdIn) {
- sendInput(process, tis);
- } else {
- process.getOutputStream().close();
+ if (config.isReturnStderr()) {
+ metadata.set(ExternalProcess.STD_ERR, result.getStderr());
}
- InputStream out = process.getInputStream();
- InputStream err = process.getErrorStream();
+ // Determine content source
+ String effectiveContentSource = config.getContentSource();
+ if (effectiveContentSource == null) {
+ effectiveContentSource = hasOutputFile ? "outputFile" :
"stdout";
+ }
- if (hasPatterns) {
- extractMetadata(err, metadata);
+ XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(handler, metadata, context);
+ xhtml.startDocument();
- if (outputFromStdOut) {
- extractOutput(out, xhtml);
- } else {
- extractMetadata(out, metadata);
- }
- } else {
- ignoreStream(err);
+ // Process each stream through its handler
+ handleStream(result.getStdout(), stdoutHandler,
+ "stdout".equals(effectiveContentSource),
+ xhtml, metadata, context);
- if (outputFromStdOut) {
- extractOutput(out, xhtml);
- } else {
- ignoreStream(out);
- }
- }
- } finally {
- try {
- process.waitFor();
- } catch (InterruptedException ignore) {
- }
- }
+ handleStream(result.getStderr(), stderrHandler,
+ "stderr".equals(effectiveContentSource),
+ xhtml, metadata, context);
- // Grab the output if we haven't already
- if (!outputFromStdOut) {
- try (FileInputStream fileInputStream = new
FileInputStream(output)) {
- extractOutput(fileInputStream, xhtml);
+ if (hasOutputFile && outFile != null) {
+ handleOutputFile(outFile, outputFileHandler,
+ "outputFile".equals(effectiveContentSource),
+ xhtml, metadata, context);
}
- }
- }
- /**
- * Starts a thread that extracts the contents of the standard output
- * stream of the given process to the given XHTML content handler.
- * The standard output stream is closed once fully processed.
- *
- * @param stream
- * @param xhtml XHTML content handler
- * @throws SAXException if the XHTML SAX events could not be handled
- * @throws IOException if an input error occurred
- */
- private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
- throws SAXException, IOException {
- try (Reader reader = new InputStreamReader(stream, UTF_8)) {
- xhtml.startDocument();
- xhtml.startElement("p");
- char[] buffer = new char[1024];
- for (int n = reader.read(buffer); n != -1; n =
reader.read(buffer)) {
- xhtml.characters(buffer, 0, n);
- }
- xhtml.endElement("p");
xhtml.endDocument();
}
}
- /**
- * Starts a thread that sends the contents of the given input stream
- * to the standard input stream of the given process. Potential
- * exceptions are ignored, and the standard input stream is closed
- * once fully processed. Note that the given input stream is <em>not</em>
- * closed by this method.
- *
- * @param process process
- * @param stream input stream
- */
- private void sendInput(final Process process, final InputStream stream) {
- Thread t = new Thread(() -> {
- OutputStream stdin = process.getOutputStream();
- try {
- IOUtils.copy(stream, stdin);
- } catch (IOException e) {
- //swallow
+ /**
+ * Routes one captured process stream (stdout or stderr).
+ * <p>
+ * Nothing happens for {@code null} or empty content. If a handler is
+ * given, it parses the content (encoded as UTF-8); its SAX output goes
+ * to the XHTML body when this stream is the content source and to
+ * {@code DISCARD_HANDLER} otherwise. Without a handler, the content is
+ * written to the XHTML body line by line, but only when this stream is
+ * the content source.
+ *
+ * @param content the captured text of the stream, may be {@code null}
+ * @param handler parser for the content, or {@code null} for raw text
+ * @param isContentSource whether this stream feeds the XHTML body
+ * @param xhtml the XHTML output, already started by the caller
+ * @param metadata passed through to the handler
+ * @param context passed through to the handler
+ */
+ private void handleStream(String content, Parser handler,
+ boolean isContentSource,
+ XHTMLContentHandler xhtml, Metadata metadata,
+ ParseContext context)
+ throws IOException, SAXException, TikaException {
+ if (content == null || content.isEmpty()) {
+ return;
+ }
+ if (handler != null) {
+ ContentHandler target = isContentSource ?
+ new BodyContentHandler(xhtml) : DISCARD_HANDLER;
+ try (TikaInputStream tis = TikaInputStream.get(
+ content.getBytes(StandardCharsets.UTF_8))) {
+ handler.parse(tis, target, metadata, context);
+ }
+ } else if (isContentSource) {
+ // No handler — write raw content as XHTML text.
+ // Split on LF and CRLF alike so that tools emitting Windows line
+ // endings do not leave a stray carriage return on every line;
+ // limit -1 keeps trailing empty lines.
+ String[] lines = content.split("\r?\n", -1);
+ for (int i = 0; i < lines.length; i++) {
+ xhtml.characters(lines[i]);
+ // newline between lines only, none after the last one
+ if (i < lines.length - 1) {
+ xhtml.newline();
+ }
}
- });
- t.start();
- try {
- t.join();
- } catch (InterruptedException ignore) {
}
}
- private void extractMetadata(final InputStream stream, final Metadata
metadata) {
- Thread t = new Thread(() -> {
- BufferedReader reader;
- reader = new BufferedReader(new InputStreamReader(stream, UTF_8));
- try {
- String line;
- while ((line = reader.readLine()) != null) {
- boolean consumed = false;
- for (Map.Entry<Pattern, String> entry :
metadataPatterns.entrySet()) {
- Matcher m = entry.getKey().matcher(line);
- if (m.find()) {
- consumed = true;
- if (entry.getValue() != null &&
- !entry.getValue().equals("")) {
- metadata.add(entry.getValue(), m.group(1));
- } else {
- metadata.add(m.group(1), m.group(2));
- }
- }
- }
- if (!consumed) {
- ignoredLineConsumer.consume(line);
- }
+ /**
+ * Routes the file the external command wrote to {@code ${OUTPUT_FILE}}.
+ * <p>
+ * With a handler, the handler parses the file; its SAX output goes to
+ * the XHTML body when the output file is the content source and to
+ * {@code DISCARD_HANDLER} otherwise. Without a handler, the file is
+ * read as text and written to the XHTML body, one newline after every
+ * line, but only when the output file is the content source.
+ *
+ * @param outFile the file written by the external command
+ * @param handler parser for the file, or {@code null} for raw text
+ * @param isContentSource whether the file feeds the XHTML body
+ * @param xhtml the XHTML output, already started by the caller
+ * @param metadata passed through to the handler
+ * @param context passed through to the handler
+ */
+ private void handleOutputFile(Path outFile, Parser handler,
+ boolean isContentSource,
+ XHTMLContentHandler xhtml, Metadata metadata,
+ ParseContext context)
+ throws IOException, SAXException, TikaException {
+ if (handler != null) {
+ // Handler present: it always runs; only the destination of its
+ // SAX events depends on whether this file is the content source.
+ ContentHandler sink;
+ if (isContentSource) {
+ sink = new BodyContentHandler(xhtml);
+ } else {
+ sink = DISCARD_HANDLER;
+ }
+ try (TikaInputStream fileStream = TikaInputStream.get(outFile)) {
+ handler.parse(fileStream, sink, metadata, context);
+ }
+ return;
+ }
+ if (isContentSource) {
+ // No handler — write raw file content as XHTML text
+ try (BufferedReader reader = Files.newBufferedReader(outFile)) {
+ for (String text = reader.readLine(); text != null;
+ text = reader.readLine()) {
+ xhtml.characters(text);
+ xhtml.newline();
}
- } catch (IOException e) {
- // Ignore
- } finally {
- IOUtils.closeQuietly(reader);
- IOUtils.closeQuietly(stream);
}
- });
- t.start();
- try {
- t.join();
- } catch (InterruptedException ignore) {
}
}
/**
- * Consumer contract
- *
- * @since Apache Tika 1.14
+ * Returns the configuration for this parser.
+ *
+ * @return the {@code config} field held by this parser; it is returned
+ * directly, no copy is made
*/
- public interface LineConsumer extends Serializable {
- /**
- * A null consumer
- */
- LineConsumer NULL = line -> {
- // ignores
- };
-
- /**
- * Consume a line
- *
- * @param line a line of string
- */
- void consume(String line);
+ public ExternalParserConfig getConfig() {
+ return config;
}
-
-
}
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParserConfig.java
b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java
similarity index 59%
rename from
tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParserConfig.java
rename to
tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java
index 913565af80..ac71a3ff59 100644
---
a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParserConfig.java
+++
b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.external2;
+package org.apache.tika.parser.external;
import java.io.Serializable;
import java.util.ArrayList;
@@ -30,13 +30,18 @@ import org.apache.tika.parser.Parser;
*/
public class ExternalParserConfig implements Serializable {
- private static final long serialVersionUID = 1L;
+ private static final long serialVersionUID = 2L;
private List<String> supportedTypes = new ArrayList<>();
private List<String> commandLine = new ArrayList<>();
- private Parser outputParser;
+ private Parser stdoutHandler;
+ private Parser stderrHandler;
+ private Parser outputFileHandler;
+ private String contentSource;
+ private List<String> checkCommandLine;
+ private List<Integer> checkErrorCodes;
private boolean returnStdout = false;
- private boolean returnStderr = true;
+ private boolean returnStderr = false;
private long timeoutMs = ExternalParser.DEFAULT_TIMEOUT_MS;
private int maxStdErr = 10000;
private int maxStdOut = 10000;
@@ -60,12 +65,60 @@ public class ExternalParserConfig implements Serializable {
this.commandLine = commandLine;
}
- public Parser getOutputParser() {
- return outputParser;
+ /**
+ * Returns the parser meant to process the external command's stdout.
+ *
+ * @return the stdout handler; {@code null} unless one has been set
+ */
+ public Parser getStdoutHandler() {
+ return stdoutHandler;
}
- public void setOutputParser(Parser outputParser) {
- this.outputParser = outputParser;
+ /**
+ * Sets the parser meant to process the external command's stdout.
+ *
+ * @param stdoutHandler the handler, or {@code null} for none
+ */
+ public void setStdoutHandler(Parser stdoutHandler) {
+ this.stdoutHandler = stdoutHandler;
+ }
+
+ /**
+ * Returns the parser meant to process the external command's stderr.
+ *
+ * @return the stderr handler; {@code null} unless one has been set
+ */
+ public Parser getStderrHandler() {
+ return stderrHandler;
+ }
+
+ /**
+ * Sets the parser meant to process the external command's stderr.
+ *
+ * @param stderrHandler the handler, or {@code null} for none
+ */
+ public void setStderrHandler(Parser stderrHandler) {
+ this.stderrHandler = stderrHandler;
+ }
+
+ /**
+ * Returns the parser meant to process the file the external command
+ * writes to {@code ${OUTPUT_FILE}}. {@code ExternalParser} reads this
+ * value in its constructor.
+ *
+ * @return the output-file handler; {@code null} unless one has been set
+ */
+ public Parser getOutputFileHandler() {
+ return outputFileHandler;
+ }
+
+ /**
+ * Sets the parser meant to process the file the external command
+ * writes to {@code ${OUTPUT_FILE}}.
+ *
+ * @param outputFileHandler the handler, or {@code null} for none
+ */
+ public void setOutputFileHandler(Parser outputFileHandler) {
+ this.outputFileHandler = outputFileHandler;
+ }
+
+ /**
+ * Which stream provides the XHTML content output.
+ * <p>
+ * Valid values: {@code "stdout"}, {@code "stderr"},
+ * {@code "outputFile"}, {@code "none"}. {@code ExternalParser} compares
+ * the value with {@code String.equals}, so it is case-sensitive, and a
+ * value matching none of the three stream names produces no content.
+ * <p>
+ * If {@code null}, defaults to {@code "stdout"} when no
+ * {@code ${OUTPUT_FILE}} token is in the command, or
+ * {@code "outputFile"} when it is.
+ *
+ * @return the configured content source, or {@code null} if unset
+ */
+ public String getContentSource() {
+ return contentSource;
+ }
+
+ /**
+ * Sets which stream provides the XHTML content output. The value is
+ * stored as given; it is not validated here.
+ *
+ * @param contentSource {@code "stdout"}, {@code "stderr"},
+ * {@code "outputFile"} or {@code "none"}; {@code null} selects the
+ * default
+ */
+ public void setContentSource(String contentSource) {
+ this.contentSource = contentSource;
+ }
+
+ /**
+ * Returns the command line used to check whether the external tool is
+ * available. When this is {@code null} or empty, {@code ExternalParser}
+ * performs no check and treats the tool as always available.
+ *
+ * @return the check command line (the stored list itself, not a copy),
+ * or {@code null} if unset
+ */
+ public List<String> getCheckCommandLine() {
+ return checkCommandLine;
+ }
+
+ /**
+ * Sets the command line used to check whether the external tool is
+ * available. The list reference is stored as given; no copy is made.
+ *
+ * @param checkCommandLine the check command and its arguments, or
+ * {@code null}/empty for no check
+ */
+ public void setCheckCommandLine(List<String> checkCommandLine) {
+ this.checkCommandLine = checkCommandLine;
+ }
+
+ /**
+ * Returns the exit codes of the check command that mean "tool not
+ * available". When a check command is configured and this is
+ * {@code null} or empty, {@code ExternalParser} falls back to 127.
+ *
+ * @return the error codes (the stored list itself, not a copy), or
+ * {@code null} if unset
+ */
+ public List<Integer> getCheckErrorCodes() {
+ return checkErrorCodes;
+ }
+
+ /**
+ * Sets the exit codes of the check command that mean "tool not
+ * available". The list reference is stored as given; no copy is made.
+ *
+ * @param checkErrorCodes the error codes, or {@code null}/empty for
+ * the default
+ */
+ public void setCheckErrorCodes(List<Integer> checkErrorCodes) {
+ this.checkErrorCodes = checkErrorCodes;
}
public boolean isReturnStdout() {
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
deleted file mode 100644
index 754bcf4454..0000000000
---
a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.external;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.StringTokenizer;
-import java.util.regex.Pattern;
-import javax.xml.parsers.DocumentBuilder;
-
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.utils.XMLReaderUtils;
-
-/**
- * Builds up ExternalParser instances based on XML file(s)
- * which define what to run, for what, and how to process
- * any output metadata.
- * Typically used to configure up a series of external programs
- * (like catdoc or pdf2txt) to extract text content from documents.
- *
- * <pre>
- * TODO XML DTD Here
- * </pre>
- */
-public final class ExternalParsersConfigReader implements
ExternalParsersConfigReaderMetKeys {
-
- public static List<ExternalParser> read(InputStream stream) throws
TikaException, IOException {
- try {
- DocumentBuilder builder = XMLReaderUtils.getDocumentBuilder();
- Document document = builder.parse(new InputSource(stream));
- return read(document);
- } catch (SAXException e) {
- throw new TikaException("Invalid parser configuration", e);
- }
- }
-
- public static List<ExternalParser> read(Document document) throws
TikaException, IOException {
- return read(document.getDocumentElement());
- }
-
- public static List<ExternalParser> read(Element element) throws
TikaException, IOException {
- List<ExternalParser> parsers = new ArrayList<>();
-
- if (element != null &&
element.getTagName().equals(EXTERNAL_PARSERS_TAG)) {
- NodeList nodes = element.getChildNodes();
- for (int i = 0; i < nodes.getLength(); i++) {
- Node node = nodes.item(i);
- if (node.getNodeType() == Node.ELEMENT_NODE) {
- Element child = (Element) node;
- if (child.getTagName().equals(PARSER_TAG)) {
- ExternalParser p = readParser(child);
- if (p != null) {
- parsers.add(p);
- }
- }
- }
- }
- } else {
- throw new MimeTypeException(
- "Not a <" + EXTERNAL_PARSERS_TAG + "/> configuration
document: " +
- (element != null ? element.getTagName() : "n/a"));
- }
-
- return parsers;
- }
-
- /**
- * Builds and Returns an ExternalParser, or null if a check
- * command was given that didn't match.
- */
- private static ExternalParser readParser(Element parserDef) throws
TikaException {
- ExternalParser parser = new ExternalParser();
-
- NodeList children = parserDef.getChildNodes();
- for (int i = 0; i < children.getLength(); i++) {
- Node node = children.item(i);
- if (node.getNodeType() == Node.ELEMENT_NODE) {
- Element child = (Element) node;
- switch (child.getTagName()) {
- case CHECK_TAG:
- boolean present = readCheckTagAndCheck(child);
- if (!present) {
- return null;
- }
- break;
- case COMMAND_TAG:
- parser.setCommand(getString(child));
- break;
- case MIMETYPES_TAG:
- parser.setSupportedTypes(readMimeTypes(child));
- break;
- case METADATA_TAG:
-
parser.setMetadataExtractionPatterns(readMetadataPatterns(child));
- break;
- default:
- throw new IllegalArgumentException("reaction not
defined for " + child.getTagName());
- }
- }
- }
-
- return parser;
- }
-
- private static Set<MediaType> readMimeTypes(Element mimeTypes) {
- Set<MediaType> types = new HashSet<>();
-
- NodeList children = mimeTypes.getChildNodes();
- for (int i = 0; i < children.getLength(); i++) {
- Node node = children.item(i);
- if (node.getNodeType() == Node.ELEMENT_NODE) {
- Element child = (Element) node;
- if (child.getTagName().equals(MIMETYPE_TAG)) {
- types.add(MediaType.parse(getString(child)));
- }
- }
- }
-
- return types;
- }
-
- private static Map<Pattern, String> readMetadataPatterns(Element
metadataDef) {
- Map<Pattern, String> metadata = new HashMap<>();
-
- NodeList children = metadataDef.getChildNodes();
- for (int i = 0; i < children.getLength(); i++) {
- Node node = children.item(i);
- if (node.getNodeType() == Node.ELEMENT_NODE) {
- Element child = (Element) node;
- if (child.getTagName().equals(METADATA_MATCH_TAG)) {
- String metadataKey = child.getAttribute(METADATA_KEY_ATTR);
- Pattern pattern = Pattern.compile(getString(child));
- metadata.put(pattern, metadataKey);
- }
- }
- }
-
- return metadata;
- }
-
- private static boolean readCheckTagAndCheck(Element checkDef) {
- String command = null;
- List<Integer> errorVals = new ArrayList<>();
-
- NodeList children = checkDef.getChildNodes();
- for (int i = 0; i < children.getLength(); i++) {
- Node node = children.item(i);
- if (node.getNodeType() == Node.ELEMENT_NODE) {
- Element child = (Element) node;
- if (child.getTagName().equals(COMMAND_TAG)) {
- command = getString(child);
- }
- if (child.getTagName().equals(ERROR_CODES_TAG)) {
- String errs = getString(child);
- StringTokenizer st = new StringTokenizer(errs, ",");
- while (st.hasMoreElements()) {
- try {
- String s = st.nextToken();
- errorVals.add(Integer.parseInt(s));
- } catch (NumberFormatException e) {
- //swallow
- }
- }
- }
- }
- }
-
- if (command != null) {
- String[] theCommand = command.split(" ");
- int[] errVals = new int[errorVals.size()];
- for (int i = 0; i < errVals.length; i++) {
- errVals[i] = errorVals.get(i);
- }
-
- return ExternalParser.check(theCommand, errVals);
- }
-
- // No check command, so assume it's there
- return true;
- }
-
- private static String getString(Element element) {
- StringBuilder s = new StringBuilder();
-
- NodeList children = element.getChildNodes();
- for (int i = 0; i < children.getLength(); i++) {
- Node node = children.item(i);
- if (node.getNodeType() == Node.TEXT_NODE) {
- s.append(node.getNodeValue());
- }
- }
-
- return s.toString();
- }
-}
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
deleted file mode 100644
index 86369c6cd7..0000000000
---
a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.external;
-
-/**
- * Met Keys used by the {@link ExternalParsersConfigReader}.
- */
-public interface ExternalParsersConfigReaderMetKeys {
-
- String EXTERNAL_PARSERS_TAG = "external-parsers";
-
- String PARSER_TAG = "parser";
-
- String COMMAND_TAG = "command";
-
- String CHECK_TAG = "check";
-
- String ERROR_CODES_TAG = "error-codes";
-
- String MIMETYPES_TAG = "mime-types";
-
- String MIMETYPE_TAG = "mime-type";
-
- String METADATA_TAG = "metadata";
-
- String METADATA_MATCH_TAG = "match";
-
- String METADATA_KEY_ATTR = "key";
-}
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
deleted file mode 100644
index 4822a79c08..0000000000
---
a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.external;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.List;
-
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.exception.TikaException;
-
-/**
- * Creates instances of ExternalParser based on XML
- * configuration files.
- *
- * @see ExternalParsersConfigReader
- */
-public class ExternalParsersFactory {
-
- public static List<ExternalParser> create() throws IOException,
TikaException {
- return create(new ServiceLoader());
- }
-
- public static List<ExternalParser> create(ServiceLoader loader)
- throws IOException, TikaException {
- return create("tika-external-parsers.xml", loader);
- }
-
- public static List<ExternalParser> create(String filename, ServiceLoader
loader)
- throws IOException, TikaException {
- String filepath =
-
ExternalParsersFactory.class.getPackage().getName().replace('.', '/') + "/" +
- filename;
- Enumeration<URL> files = loader.findServiceResources(filepath);
- ArrayList<URL> list = Collections.list(files);
- URL[] urls = list.toArray(new URL[0]);
- return create(urls);
- }
-
- public static List<ExternalParser> create(URL... urls) throws IOException,
TikaException {
- List<ExternalParser> parsers = new ArrayList<>();
- for (URL url : urls) {
- try (InputStream stream = url.openStream()) {
- parsers.addAll(ExternalParsersConfigReader.read(stream));
- }
- }
- return parsers;
- }
-}
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java
b/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java
deleted file mode 100644
index 4ee27b9d65..0000000000
--- a/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * External parser process.
- */
[email protected]("1.0.0")
-package org.apache.tika.parser.external;
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
b/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
deleted file mode 100644
index 1c87ecaf99..0000000000
---
a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.external2;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import org.apache.tika.config.ConfigDeserializer;
-import org.apache.tika.config.JsonConfig;
-import org.apache.tika.config.TikaComponent;
-import org.apache.tika.config.TikaProgressTracker;
-import org.apache.tika.config.TimeoutLimits;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.ExternalProcess;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.EmptyParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.utils.FileProcessResult;
-import org.apache.tika.utils.ProcessUtils;
-
-/**
- * This is a next generation external parser that uses some of the more
- * recent additions to Tika. This is an experimental alternative to the
- * {@link org.apache.tika.parser.external.ExternalParser}.
- * Specifically, it relies more on configuration than the SPI model.
- * Further, users can specify a parser to handle the output
- * of the external process.
- */
-@TikaComponent
-public class ExternalParser implements Parser {
-
- public static final long DEFAULT_TIMEOUT_MS = 60000;
-
- public static final String INPUT_FILE_TOKEN = "${INPUT_FILE}";
-
- public static final String OUTPUT_FILE_TOKEN = "${OUTPUT_FILE}";
-
- private static Pattern INPUT_TOKEN_MATCHER =
Pattern.compile("\\$\\{INPUT_FILE}");
- private static Pattern OUTPUT_TOKEN_MATCHER =
Pattern.compile("\\$\\{OUTPUT_FILE}");
-
- private static final Logger LOG =
LoggerFactory.getLogger(ExternalParser.class);
-
- private final ExternalParserConfig config;
-
- // Cached values derived from config
- private final Set<MediaType> supportedTypes;
- private final List<String> commandLine;
- private final Parser outputParser;
-
- /**
- * Default constructor - not typically useful since ExternalParser
requires configuration.
- */
- public ExternalParser() {
- this(new ExternalParserConfig());
- }
-
- /**
- * Programmatic constructor with typed config.
- */
- public ExternalParser(ExternalParserConfig config) {
- this.config = config;
- this.supportedTypes = new HashSet<>();
- for (String s : config.getSupportedTypes()) {
- this.supportedTypes.add(MediaType.parse(s));
- }
- this.commandLine = new ArrayList<>(config.getCommandLine());
- this.outputParser = config.getOutputParser() != null ?
- config.getOutputParser() : EmptyParser.INSTANCE;
- }
-
- /**
- * JSON config constructor - used for deserialization.
- */
- public ExternalParser(JsonConfig jsonConfig) {
- this(ConfigDeserializer.buildConfig(jsonConfig,
ExternalParserConfig.class));
- }
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return supportedTypes;
- }
-
- @Override
- public void parse(TikaInputStream tis, ContentHandler handler, Metadata
metadata,
- ParseContext context) throws IOException, SAXException,
TikaException {
- //this may remain null, depending on whether the external parser
writes to a file
- Path outFile = null;
- try (TemporaryResources tmp = new TemporaryResources()) {
- Path p = tis.getPath();
- List<String> thisCommandLine = new ArrayList<>();
- Matcher inputMatcher = INPUT_TOKEN_MATCHER.matcher("");
- Matcher outputMatcher = OUTPUT_TOKEN_MATCHER.matcher("");
- boolean outputFileInCommandline = false;
- for (String c : commandLine) {
- if (inputMatcher.reset(c).find()) {
- String updated = c.replace(INPUT_FILE_TOKEN,
-
ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString()));
- thisCommandLine.add(updated);
- } else if (outputMatcher.reset(c).find()) {
- outFile = Files.createTempFile("tika-external2-", "");
- String updated = c.replace(OUTPUT_FILE_TOKEN,
-
ProcessUtils.escapeCommandLine(outFile.toAbsolutePath().toString()));
- thisCommandLine.add(updated);
- outputFileInCommandline = true;
- } else {
- thisCommandLine.add(c);
- }
- }
- FileProcessResult result = null;
- long localTimeoutMillis =
TimeoutLimits.getProcessTimeoutMillis(context, config.getTimeoutMs());
- if (outputFileInCommandline) {
- result = ProcessUtils.execute(new
ProcessBuilder(thisCommandLine),
- localTimeoutMillis, config.getMaxStdOut(),
config.getMaxStdErr());
- } else {
- outFile = Files.createTempFile("tika-external2-", "");
- result = ProcessUtils.execute(new
ProcessBuilder(thisCommandLine),
- localTimeoutMillis, outFile, config.getMaxStdErr());
- }
- metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout());
- metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue());
- TikaProgressTracker.update(context);
- metadata.set(ExternalProcess.STD_OUT_LENGTH,
result.getStdoutLength());
- metadata.set(ExternalProcess.STD_OUT_IS_TRUNCATED,
- result.isStdoutTruncated());
- metadata.set(ExternalProcess.STD_ERR_LENGTH,
result.getStderrLength());
- metadata.set(ExternalProcess.STD_ERR_IS_TRUNCATED,
- result.isStderrTruncated());
-
- if (config.isReturnStdout()) {
- metadata.set(ExternalProcess.STD_OUT, result.getStdout());
- }
- if (config.isReturnStderr()) {
- metadata.set(ExternalProcess.STD_ERR, result.getStderr());
- }
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
metadata, context);
- xhtml.startDocument();
- handleOutput(result, outFile, xhtml, metadata, context);
- xhtml.endDocument();
- } finally {
- if (outFile != null) {
- Files.delete(outFile);
- }
- }
- }
-
- private void handleOutput(FileProcessResult result, Path outFile,
- XHTMLContentHandler xhtml, Metadata metadata,
- ParseContext parseContext) throws SAXException,
TikaException,
- IOException {
- if (outputParser == EmptyParser.INSTANCE) {
- if (outFile != null) {
- try (BufferedReader reader = Files.newBufferedReader(outFile))
{
- String line = reader.readLine();
- while (line != null) {
- //do we want to wrap this in <p></p> elements?
- xhtml.characters(line);
- xhtml.newline();
- line = reader.readLine();
- }
- }
- } else {
- //read this in line by line and wrap <p></p> elements?
- xhtml.characters(result.getStdout());
- }
- } else {
- if (outFile != null) {
- try (TikaInputStream tis = TikaInputStream.get(outFile)) {
- outputParser.parse(tis, new BodyContentHandler(xhtml),
metadata, parseContext);
- }
- } else {
- try (TikaInputStream tis = TikaInputStream.get(
- result.getStdout().getBytes(StandardCharsets.UTF_8))) {
- outputParser.parse(tis, new BodyContentHandler(xhtml),
metadata, parseContext);
- }
- }
- }
-
- }
-
- /**
- * Returns the output parser used to parse the external process output.
- */
- public Parser getOutputParser() {
- return outputParser;
- }
-
- /**
- * Returns the configuration for this parser.
- */
- public ExternalParserConfig getConfig() {
- return config;
- }
-}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java
b/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java
index 5ee5865fe1..eb983ec7de 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java
@@ -23,9 +23,14 @@ import java.nio.file.Path;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class ProcessUtils {
+ private static final Logger LOG =
LoggerFactory.getLogger(ProcessUtils.class);
private static final ConcurrentHashMap<String, Process> PROCESS_MAP = new
ConcurrentHashMap<>();
@@ -219,4 +224,73 @@ public class ProcessUtils {
}
+ /**
+ * Checks to see if a single command can be run by delegating to
+ * {@link #checkCommand(String[], int...)}.
+ * <p>
+ * The string is wrapped, unchanged, in a one-element array; it is NOT
+ * split on whitespace. Pass only the executable name or path here
+ * (e.g. {@code "myapp"}). A value such as {@code "myapp --version"} would
+ * be handed to {@link Runtime#exec(String[])} as the program name itself;
+ * to supply arguments use the array overload, e.g.
+ * {@code new String[]{"myapp", "--version"}}.
+ *
+ * @param checkCmd The executable to run, used as the only element of the
+ * command array
+ * @param errorValue What is considered an error value? Passed through to
+ * the array overload, which falls back to 127 (command not found) when
+ * none are given.
+ * @return true if the command ran successfully (exit code not in
+ * errorValue list)
+ */
+ public static boolean checkCommand(String checkCmd, int... errorValue) {
+ // Single-element argv: the whole string is the program to execute.
+ return checkCommand(new String[]{checkCmd}, errorValue);
+ }
+
+    /**
+     * Checks to see if the command can be run. Typically used with
+     * something like {@code new String[]{"myapp", "--version"}} to check to
+     * see if "myapp" is installed and on the path.
+     * <p>
+     * The command is started via {@link Runtime#exec(String[])}, its stdout
+     * and stderr are consumed by {@code StreamGobbler} threads, and it is
+     * given 60 seconds to exit. Whatever the outcome, the process is
+     * forcibly destroyed before this method returns.
+     *
+     * @param checkCmd   The check command to run (program followed by its
+     *                   arguments)
+     * @param errorValue What is considered an error value? Default is 127
+     *                   (command not found).
+     * @return true if the command exited within the timeout with an exit
+     *         code not in the errorValue list; false if it could not be
+     *         started, timed out, the calling thread was interrupted, or
+     *         the JVM reported a posix_spawn/UNIXProcess error
+     * @throws SecurityException if process creation is not permitted
+     */
+    public static boolean checkCommand(String[] checkCmd, int... errorValue) {
+        if (errorValue.length == 0) {
+            errorValue = new int[]{127};
+        }
+
+        Process process = null;
+        try {
+            process = Runtime.getRuntime().exec(checkCmd);
+            // Both gobblers are created with a buffer limit of 0; the
+            // output only has to be read, not kept.
+            StreamGobbler outGobbler = new StreamGobbler(process.getInputStream(), 0);
+            StreamGobbler errGobbler = new StreamGobbler(process.getErrorStream(), 0);
+            Thread outThread = new Thread(outGobbler);
+            Thread errThread = new Thread(errGobbler);
+            outThread.start();
+            errThread.start();
+            boolean finished = process.waitFor(60000, TimeUnit.MILLISECONDS);
+            if (!finished) {
+                throw new TimeoutException();
+            }
+            // Give the reader threads a moment to finish; do not wait forever.
+            outThread.join(1000);
+            errThread.join(1000);
+            int result = process.exitValue();
+            LOG.debug("exit value for {}: {}", checkCmd[0], result);
+            for (int err : errorValue) {
+                if (result == err) {
+                    return false;
+                }
+            }
+            return true;
+        } catch (InterruptedException e) {
+            // Restore the interrupt status so callers further up the stack
+            // can still observe that this thread was asked to stop.
+            Thread.currentThread().interrupt();
+            LOG.debug("exception trying to run {}", checkCmd[0], e);
+            return false;
+        } catch (IOException | TimeoutException e) {
+            LOG.debug("exception trying to run {}", checkCmd[0], e);
+            return false;
+        } catch (SecurityException se) {
+            // Deliberately propagated: a denied exec is not "command missing".
+            throw se;
+        } catch (Error err) {
+            // Some JVMs surface a failed process launch as an Error whose
+            // message names posix_spawn/UNIXProcess; treat that as
+            // "cannot run" and rethrow any other Error.
+            if (err.getMessage() != null &&
+                    (err.getMessage().contains("posix_spawn") ||
+                            err.getMessage().contains("UNIXProcess"))) {
+                LOG.debug("(TIKA-1526): exception trying to run: {}", checkCmd[0], err);
+                return false;
+            }
+            throw err;
+        } finally {
+            if (process != null) {
+                process.destroyForcibly();
+            }
+        }
+    }
+
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java
b/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java
index b2c0c7b210..7bcce46ac5 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java
@@ -20,12 +20,16 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
public class StreamGobbler implements Runnable {
+ // Maximum chars to buffer for a single line before truncating.
+ // Prevents OOM from a process outputting gigabytes without a newline.
+ private static final int MAX_LINE_LENGTH = 1_000_000;
private final InputStream is;
private final int maxBufferLength;
@@ -41,17 +45,17 @@ public class StreamGobbler implements Runnable {
@Override
public void run() {
-
- try (BufferedReader r = new BufferedReader(
+ try (Reader r = new BufferedReader(
new InputStreamReader(is, StandardCharsets.UTF_8))) {
- String line = r.readLine();
+ String line = readLineBounded(r);
while (line != null) {
if (maxBufferLength >= 0) {
if (streamLength + line.length() > maxBufferLength) {
int len = maxBufferLength - (int) streamLength;
if (len > 0) {
isTruncated = true;
- String truncatedLine = line.substring(0,
Math.min(line.length(), len));
+ String truncatedLine =
+ line.substring(0, Math.min(line.length(),
len));
lines.add(truncatedLine);
}
} else {
@@ -59,13 +63,51 @@ public class StreamGobbler implements Runnable {
}
}
streamLength += line.length();
- line = r.readLine();
+ line = readLineBounded(r);
}
} catch (IOException e) {
return;
}
}
+    /**
+     * Reads the next line from {@code r}, keeping at most
+     * {@link #MAX_LINE_LENGTH} chars of it. Chars beyond that limit are
+     * still consumed up to the line terminator but are dropped, and
+     * {@code isTruncated} is set. A line ends at {@code \n}, {@code \r},
+     * {@code \r\n} or end of stream; the terminator is not part of the
+     * returned text.
+     *
+     * @param r the reader to pull chars from; {@code mark}/{@code reset}
+     *          are used on it to look one char past a {@code \r}
+     * @return the (possibly shortened) line, or {@code null} when the end
+     *         of the stream is hit before any char was kept
+     * @throws IOException if reading, marking or resetting the reader fails
+     */
+    private String readLineBounded(Reader r) throws IOException {
+        StringBuilder kept = new StringBuilder();
+        boolean overflowed = false;
+
+        // Consume chars until a line terminator or end of stream shows up.
+        int c = r.read();
+        while (c != -1 && c != '\n' && c != '\r') {
+            if (!overflowed) {
+                if (kept.length() >= MAX_LINE_LENGTH) {
+                    // Limit reached: drop this char and everything after it
+                    // on the current line.
+                    overflowed = true;
+                    isTruncated = true;
+                } else {
+                    kept.append((char) c);
+                }
+            }
+            c = r.read();
+        }
+
+        if (c == '\r') {
+            // Swallow the '\n' of a "\r\n" pair; anything else is pushed
+            // back so it starts the next line.
+            r.mark(1);
+            int following = r.read();
+            if (following != '\n' && following != -1) {
+                r.reset();
+            }
+        }
+
+        if (c == -1 && kept.length() == 0) {
+            return null;
+        }
+        return kept.toString();
+    }
+
public List<String> getLines() {
return lines;
}
diff --git
a/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml
b/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml
deleted file mode 100644
index 9a1f356834..0000000000
---
a/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml
+++ /dev/null
@@ -1,117 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<!--
- Description: This xml file defines external commands to be run by Tika
- as parsers.
--->
-<external-parsers>
- <!-- This example uses ffmpeg for video metadata extraction -->
- <parser>
- <check>
- <command>ffmpeg -version</command>
- <error-codes>126,127</error-codes>
- </check>
- <command>ffmpeg -i ${INPUT}</command>
- <mime-types>
- <mime-type>video/avi</mime-type>
- <mime-type>video/mpeg</mime-type>
- <mime-type>video/x-msvideo</mime-type>
- </mime-types>
- <metadata>
- <match
key="xmpDM:audioSampleRate">\s*Stream.*:.+Audio:.*,\s+(\d+)\s+Hz,.*</match>
- <match
key="xmpDM:audioChannelType">\s*Stream.*:.+Audio:.*\d+\s+Hz,\s+(\d{1,2})\s+channels.*</match>
- <match
key="xmpDM:audioCompressor">\s*Stream.*:.+Audio:\s+([A-Za-z0-9_\(\)/\[\]
]+),.*</match>
- <match key="xmpDM:duration">\s*Duration:\s*([0-9:\.]+),.*</match>
- <match
key="xmpDM:fileDataRate">\s*Duration:.*,\s*bitrate:\s+([0-9A-Za-z/ ]+).*</match>
- <match
key="xmpDM:videoColorSpace">\s*Stream.*:\s+Video:\s+[A-Za-z0-9\(\)/
]+,\s+([A-Za-z0-9\(\) ,]+),\s+[0-9x]+,.*</match>
- <match
key="xmpDM:videoCompressor">\s*Stream.*:\s+Video:\s+([A-Za-z0-9\(\)/
]+),.*</match>
- <match
key="xmpDM:videoFrameRate">\s*Stream.*:\s+Video:.*,\s+([0-9]+)\s+fps,.*</match>
- <match key="encoder">\s*encoder\s*\:\s*(\w+).*</match>
- <match
key="videoResolution">\s*Stream.*:\s+Video:.*,\s+([0-9x]+),.*</match>
- </metadata>
- </parser>
- <parser>
- <check>
- <command>exiftool -ver</command>
- <error-codes>126,127</error-codes>
- </check>
- <command>env FOO=${OUTPUT} exiftool ${INPUT}</command>
- <mime-types>
- <mime-type>video/avi</mime-type>
- <mime-type>video/mpeg</mime-type>
- <mime-type>video/x-msvideo</mime-type>
- <mime-type>video/mp4</mime-type>
- </mime-types>
- <metadata>
- <match>\s*([A-Za-z0-9/ \(\)]+\S{1})\s+:\s+([A-Za-z0-9\(\)\[\]
\:\-\.]+)\s*</match>
- </metadata>
- </parser>
- <parser>
- <check>
- <command>sox --version</command>
- <error-codes>126,127</error-codes>
- </check>
- <command>env FOO=${OUTPUT} sox --info ${INPUT}</command>
- <mime-types>
- <mime-type>audio/3gpp</mime-type>
- <mime-type>audio/3gpp2</mime-type>
- <mime-type>audio/aac</mime-type>
- <mime-type>audio/ac3</mime-type>
- <mime-type>audio/basic</mime-type>
- <mime-type>audio/L24</mime-type>
- <mime-type>audio/mid</mime-type>
- <mime-type>audio/mpeg</mime-type>
- <mime-type>audio/mpeg3</mime-type>
- <mime-type>audio/x-mpeg-3</mime-type>
- <mime-type>audio/mpeg4-generic</mime-type>
- <mime-type>audio/mp4</mime-type>
- <mime-type>audio/mp3</mime-type>
- <mime-type>audio/x-aiff</mime-type>
- <mime-type>audio/PCMA</mime-type>
- <mime-type>audio/PCMA-WB</mime-type>
- <mime-type>audio/PCMU</mime-type>
- <mime-type>audio/PCMU-WB</mime-type>
- <mime-type>audio/ogg</mime-type>
- <mime-type>audio/vorbis</mime-type>
- <mime-type>audio/vnd.wav</mime-type>
- <mime-type>audio/vnd.wave</mime-type>
- <mime-type>audio/vnd.rn-realaudio</mime-type>
- <mime-type>audio/wav</mime-type>
- <mime-type>audio/wave</mime-type>
- <mime-type>audio/x-wav</mime-type>
- </mime-types>
- <metadata>
- <!-- Channels : 1 -->
- <match key="xmpDM:audioChannelType">\s*Channels.*:\s+(\d+)\s*</match>
- <!-- Sample Rate : 44100 -->
- <match key="xmpDM:audioSampleRate">\s*Sample Rate.*:\s+(\d+)\s*</match>
- <!-- Precision : 16-bit -->
- <match
key="xmpDM:audioSampleType">\s*Precision.*:\s+([\d\w-]+)\s*</match>
- <!-- Duration : 00:00:02.50 = 110298 samples = 187.582 CDDA
sectors -->
- <match key="xmpDM:duration">\s*Duration.*:\s+([\d:\.]+)\s*</match>
- <!-- File Size : 221k -->
- <match key="File Size">\s*File Size.*:\s+([\d\w]+)\s*</match>
- <!-- Bit Rate : 706k -->
- <match key="xmpDM:fileDataRate">\s*Bit Rate.*:\s+([\d\w]+)\s*</match>
- <!-- Sample Encoding: 16-bit Signed Integer PCM -->
- <match key="Sample Encoding">\s*Sample Encoding.*:\s+(.*)\s*</match>
- <!-- Comment : 'Comment=Processed by SoX' -->
- <match key="xmpDM:logComment">\s*Comment.*:\s+(.*)\s*</match>
- </metadata>
- </parser>
-</external-parsers>
diff --git
a/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java
b/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java
index 3b77375622..a8f49a2fbe 100644
---
a/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java
+++
b/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java
@@ -35,7 +35,6 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.StringUtils;
@@ -170,7 +169,7 @@ public class SiegfriedDetector implements Detector {
public static boolean checkHasSiegfried(String siegfriedCommandPath) {
String[] commandline = new String[]{siegfriedCommandPath, "-version"};
- return ExternalParser.check(commandline);
+ return ProcessUtils.checkCommand(commandline);
}
/**
diff --git
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
index 6f1a071419..68b5712e8f 100644
---
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
+++
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
@@ -49,7 +49,6 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;
@@ -152,7 +151,7 @@ public class GDALParser implements Parser {
private long timeoutMs = DEFAULT_TIMEOUT_MS;
public GDALParser() {
- setCommand("gdalinfo ${INPUT}");
+ setCommand("gdalinfo ${INPUT_FILE}");
}
public String getCommand() {
@@ -185,7 +184,7 @@ public class GDALParser implements Parser {
public void parse(TikaInputStream tis, ContentHandler handler, Metadata
metadata,
ParseContext context) throws IOException, SAXException,
TikaException {
- if (!ExternalParser.check("gdalinfo")) {
+ if (!ProcessUtils.checkCommand("gdalinfo")) {
return;
}
diff --git
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java
index e6a8b00f51..13fd87d021 100644
---
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java
+++
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java
@@ -28,8 +28,8 @@ import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.ProcessUtils;
/**
* Test harness for the GDAL parser.
@@ -39,7 +39,7 @@ public class TestGDALParser extends TikaTest {
private boolean canRun() {
String[] checkCmd = {"gdalinfo"};
// If GDAL is not on the path, do not run the test.
- return ExternalParser.check(checkCmd);
+ return ProcessUtils.checkCommand(checkCmd);
}
@Test
@@ -139,7 +139,7 @@ public class TestGDALParser extends TikaTest {
// If the exit code is 1 (meaning FITS isn't supported by the
installed version of
// gdalinfo, don't run this test.
String[] fitsCommand = {"gdalinfo",
getResourceAsUrl(fitsFilename).getPath()};
- assumeTrue(ExternalParser.check(fitsCommand, 1));
+ assumeTrue(ProcessUtils.checkCommand(fitsCommand, 1));
String expectedAllgMin = "-7.319537E1";
String expectedAtodcorr = "COMPLETE";
diff --git
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/java/org/apache/tika/parser/scientific/integration/TestParsers.java
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/java/org/apache/tika/parser/scientific/integration/TestParsers.java
index cb2d2236b0..99e382c07d 100644
---
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/java/org/apache/tika/parser/scientific/integration/TestParsers.java
+++
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/java/org/apache/tika/parser/scientific/integration/TestParsers.java
@@ -30,10 +30,8 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.external.CompositeExternalParser;
import org.apache.tika.parser.ocr.TesseractOCRParser;
/**
@@ -58,11 +56,6 @@ public class TestParsers {
}
int checked = 0;
- //The initial lists were developed with exiftool installed. We have
since
- //modified the 2.4.1-* files to act as if no exiftool is installed.
- //However, on systems with ffmpeg or exiftool installed, we need
- //to override those file formats
- CompositeParser externalParser = (CompositeParser) new
CompositeExternalParser();
try (BufferedReader reader =
new BufferedReader(new InputStreamReader(
getClass().getResourceAsStream(path241),
@@ -73,10 +66,6 @@ public class TestParsers {
String mediaType = data[0];
String parserClass = data[1];
- Parser external =
externalParser.getParsers().get(MediaType.parse(mediaType));
- if (external != null) {
- parserClass = externalParser.getClass().toString();
- }
assertEquals(parserClass, currentDefault.get(mediaType),
"for mediaType '" + mediaType + "'");
checked++;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java
index 9459173d6e..9a1b47d6fe 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java
@@ -28,7 +28,7 @@ import org.slf4j.LoggerFactory;
import org.apache.tika.config.Initializable;
import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.StringUtils;
public class DWGParserConfig implements Serializable, Initializable {
@@ -62,7 +62,7 @@ public class DWGParserConfig implements Serializable,
Initializable {
// Try running DWGRead from there, and see if it exists + works
String[] checkCmd = { dwgRead };
- boolean hasDwgRead = ExternalParser.check(checkCmd);
+ boolean hasDwgRead = ProcessUtils.checkCommand(checkCmd);
LOG.debug("hasDwgRead (path: " + Arrays.toString(checkCmd) + "): " +
hasDwgRead);
return hasDwgRead;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
index 0354b15155..e8c9c2d848 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
@@ -39,8 +39,8 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.StringUtils;
public class DWGParserTest extends TikaTest {
@@ -53,7 +53,7 @@ public class DWGParserTest extends TikaTest {
// Try running DWGRead from there, and see if it exists + works
String[] checkCmd = { dwgRead };
- return ExternalParser.check(checkCmd);
+ return ProcessUtils.checkCommand(checkCmd);
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 2639b457ae..27cf577229 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -75,11 +75,11 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractExternalProcessParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
@@ -209,7 +209,7 @@ public class TesseractOCRParser extends
AbstractExternalProcessParser implements
// Try running Tesseract from there, and see if it exists + works
String[] checkCmd = {tesseract};
- boolean hasTesseract = ExternalParser.check(checkCmd);
+ boolean hasTesseract = ProcessUtils.checkCommand(checkCmd);
LOG.debug("hasTesseract (path: " + Arrays.toString(checkCmd) + "): " +
hasTesseract);
return hasTesseract;
}
@@ -231,7 +231,7 @@ public class TesseractOCRParser extends
AbstractExternalProcessParser implements
// Try running ImageMagick program from there, and see if it exists +
works
String[] checkCmd = {fullImageMagickPath};
- this.hasImageMagick = ExternalParser.check(checkCmd);
+ this.hasImageMagick = ProcessUtils.checkCommand(checkCmd);
if (!this.hasImageMagick) {
LOG.debug("ImageMagick does not appear to be installed " +
"(commandline: " +
fullImageMagickPath + ")");
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java
index 69316a74a8..213647eba6 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java
@@ -34,10 +34,10 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.renderer.PageBasedRenderResults;
import org.apache.tika.renderer.PageRangeRequest;
import org.apache.tika.renderer.RenderResult;
+import org.apache.tika.utils.ProcessUtils;
public class PopplerRendererTest {
@@ -45,7 +45,7 @@ public class PopplerRendererTest {
@BeforeAll
static void checkPoppler() {
- hasPoppler = ExternalParser.check(new String[]{"pdftoppm", "-v"});
+ hasPoppler = ProcessUtils.checkCommand(new String[]{"pdftoppm", "-v"});
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
index 903b764817..3be4f2fa29 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
@@ -26,8 +26,8 @@ import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.ProcessUtils;
/**
* Test case for parsing unrar files.
@@ -40,7 +40,7 @@ public class UnrarParserTest extends AbstractPkgTest {
*/
@Test
public void testEncryptedRar() throws Exception {
- assumeTrue(ExternalParser.check("unrar"));
+ assumeTrue(ProcessUtils.checkCommand("unrar"));
Parser parser = new UnrarParser();
try (TikaInputStream tis =
getResourceAsStream("/test-documents/test-documents-enc.rar")) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
index cd863a718c..5d0ecc8023 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
@@ -49,8 +49,8 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.SystemUtils;
/**
@@ -146,7 +146,7 @@ public class StringsParser implements Parser, Initializable
{
String[] checkCmd = {stringsProg, "--version"};
try {
- stringsPresent = ExternalParser.check(checkCmd);
+ stringsPresent = ProcessUtils.checkCommand(checkCmd);
if (!stringsPresent) {
return;
}
@@ -157,7 +157,7 @@ public class StringsParser implements Parser, Initializable
{
"/dev/null"};
int[] errorValues =
{1, 2}; // Exit status code: 1 = general error; 2 =
incorrect usage.
- hasEncodingOption = ExternalParser.check(checkOpt,
errorValues);
+ hasEncodingOption = ProcessUtils.checkCommand(checkOpt,
errorValues);
}
} catch (NoClassDefFoundError ncdfe) {
// This happens under OSGi + Fork Parser - see TIKA-1507
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
index 76372396ad..9a08aebc59 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
@@ -29,13 +29,13 @@ import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.ProcessUtils;
public class StringsParserTest extends TikaTest {
public static boolean canRun() {
String[] checkCmd = {new
StringsParser().getDefaultConfig().getStringsPath() + getStringsProg(),
"--version"};
- return ExternalParser.check(checkCmd);
+ return ProcessUtils.checkCommand(checkCmd);
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
deleted file mode 100644
index 37f87a4595..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-org.apache.tika.parser.external.CompositeExternalParser
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index 8f9b957e90..dea1a9bc09 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -43,7 +43,6 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.external.CompositeExternalParser;
import org.apache.tika.parser.ogg.FlacParser;
import org.apache.tika.parser.ogg.OpusParser;
import org.apache.tika.parser.ogg.VorbisParser;
@@ -407,11 +406,6 @@ public class AutoDetectParserTest extends TikaTest {
}
}
- @Test
- public void testExternalParserIsLoaded() {
- Parser p = find((CompositeParser) AUTO_DETECT_PARSER,
CompositeExternalParser.class);
- assertNotNull(p);
- }
@Test
public void testWriteLimit() throws Exception {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index fa160184b8..8765905ecb 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -61,13 +61,13 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.ImageMetadataExtractor;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.parser.xml.XMLProfiler;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.StringUtils;
public class PDFParserTest extends TikaTest {
@@ -94,7 +94,7 @@ public class PDFParserTest extends TikaTest {
if (hasPoppler != null) {
return hasPoppler;
}
- hasPoppler = ExternalParser.check(new String[]{"pdftoppm", "-v"});
+ hasPoppler = ProcessUtils.checkCommand(new String[]{"pdftoppm", "-v"});
return hasPoppler;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
index 8c2aea0b7f..c671e01049 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
@@ -27,7 +27,7 @@ import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.utils.ProcessUtils;
/**
@@ -41,7 +41,7 @@ public class UnrarParserTest extends AbstractPkgTest {
*/
@Test
public void testEmbedded() throws Exception {
- assumeTrue(ExternalParser.check("unrar"));
+ assumeTrue(ProcessUtils.checkCommand("unrar"));
// Expected embedded resources in test-documents.rar file.
String[] expectedResources = { "testHTML.html", "testEXCEL.xls",
"testOpenOffice2.odt", "testPDF.pdf",
diff --git
a/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java
b/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java
new file mode 100644
index 0000000000..e8c9293d57
--- /dev/null
+++
b/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.external;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.parser.RegexCaptureParser;
+import org.apache.tika.parser.RegexCaptureParserConfig;
+
+public class ExternalParserTest extends TikaTest {
+
+ @Test
+ public void testConfigRegexCaptureParser() throws Exception {
+ assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(new
String[]{
+ "file", "--version"
+ }));
+ TikaLoader loader = TikaLoader.load(getConfigPath(getClass(),
"TIKA-3557.json"));
+ CompositeParser p = (CompositeParser) loader.get(Parser.class);
+ assertEquals(1, p.getAllComponentParsers().size());
+ Parser parser = p.getAllComponentParsers().get(0);
+ ExternalParser externalParser = (parser instanceof ParserDecorator)
+ ? (ExternalParser) ((ParserDecorator)
parser).getWrappedParser()
+ : (ExternalParser) parser;
+
+ Parser stdoutHandler = externalParser.getConfig().getStdoutHandler();
+ assertEquals(RegexCaptureParser.class, stdoutHandler.getClass());
+
+ Metadata m = new Metadata();
+ ContentHandler contentHandler = new DefaultHandler();
+ String output = "Something\n" +
+ "Title: the quick brown fox\n" +
+ "Author: jumped over\n" +
+ "Created: 10/20/2024";
+ try (TikaInputStream tis =
TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) {
+ stdoutHandler.parse(tis, contentHandler, m, new ParseContext());
+ }
+ assertEquals("the quick brown fox", m.get("title"));
+ }
+
+ @Test
+ public void testConfigBasic() throws Exception {
+ assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(
+ new String[]{"file", "--version"}));
+ TikaLoader loader = TikaLoader.load(
+ getConfigPath(getClass(), "TIKA-3557-no-output-parser.json"));
+ CompositeParser p = (CompositeParser) loader.get(Parser.class);
+ assertEquals(1, p.getAllComponentParsers().size());
+ Parser parser = p.getAllComponentParsers().get(0);
+ ExternalParser externalParser = (parser instanceof ParserDecorator)
+ ? (ExternalParser) ((ParserDecorator)
parser).getWrappedParser()
+ : (ExternalParser) parser;
+
+ // No handler — raw stdout becomes content (default
contentSource=stdout)
+ assertNull(externalParser.getConfig().getStdoutHandler());
+ XMLResult xmlResult = getXML("example.xml", externalParser);
+ assertContains("text/xml", xmlResult.xml);
+ }
+
+ @Test
+ public void testExifTool() throws Exception {
+ assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(
+ new String[]{"exiftool", "-ver"}));
+ TikaLoader loader = TikaLoader.load(
+ getConfigPath(getClass(), "TIKA-3557-exiftool-example.json"));
+ Parser p = loader.loadAutoDetectParser();
+ List<Metadata> metadataList =
getRecursiveMetadata("testOverlappingText.pdf", p);
+ assertEquals(1, metadataList.size());
+ Metadata m = metadataList.get(0);
+ assertEquals("application/pdf", m.get("mime"));
+ assertEquals("1", m.get("pages"));
+ assertEquals("1.4", m.get("pdf:version"));
+ }
+
+ @Test
+ public void testFfmpegConfig() throws Exception {
+ TikaLoader loader = TikaLoader.load(
+ getConfigPath(getClass(), "external-parser-ffmpeg.json"));
+ CompositeParser p = (CompositeParser) loader.get(Parser.class);
+ assertEquals(1, p.getAllComponentParsers().size());
+ Parser parser = p.getAllComponentParsers().get(0);
+ ExternalParser externalParser = (parser instanceof ParserDecorator)
+ ? (ExternalParser) ((ParserDecorator)
parser).getWrappedParser()
+ : (ExternalParser) parser;
+
+ ExternalParserConfig config = externalParser.getConfig();
+ assertNotNull(config.getCheckCommandLine());
+ assertEquals(List.of("ffmpeg", "-version"),
config.getCheckCommandLine());
+ assertEquals(List.of(126, 127), config.getCheckErrorCodes());
+ assertNotNull(config.getStderrHandler());
+ assertEquals(RegexCaptureParser.class,
config.getStderrHandler().getClass());
+ assertEquals("none", config.getContentSource());
+ assertTrue(config.isReturnStderr());
+ }
+
+ @Test
+ public void testSoxConfig() throws Exception {
+ TikaLoader loader = TikaLoader.load(
+ getConfigPath(getClass(), "external-parser-sox.json"));
+ CompositeParser p = (CompositeParser) loader.get(Parser.class);
+ assertEquals(1, p.getAllComponentParsers().size());
+ Parser parser = p.getAllComponentParsers().get(0);
+ ExternalParser externalParser = (parser instanceof ParserDecorator)
+ ? (ExternalParser) ((ParserDecorator)
parser).getWrappedParser()
+ : (ExternalParser) parser;
+
+ ExternalParserConfig config = externalParser.getConfig();
+ assertNotNull(config.getCheckCommandLine());
+ assertEquals(List.of("sox", "--version"),
config.getCheckCommandLine());
+ assertNotNull(config.getStderrHandler());
+ assertEquals(RegexCaptureParser.class,
config.getStderrHandler().getClass());
+ }
+
+ @Test
+ public void testStderrHandlerExtractsMetadata() throws Exception {
+ String ffmpegStderr = " Duration: 00:02:30.50, start: 0.000000,
bitrate: 706 kb/s\n" +
+ " Stream #0:0: Video: h264 (High), yuv420p, 1280x720, 25
fps\n" +
+ " Stream #0:1: Audio: aac, 44100 Hz, 2 channels, fltp\n";
+
+ java.util.Map<String, String> captureMap = new
java.util.LinkedHashMap<>();
+ captureMap.put("xmpDM:duration", "\\s*Duration:\\s*([0-9:\\.]+),.*");
+ captureMap.put("xmpDM:audioSampleRate",
+ "\\s*Stream.*:.+Audio:.*,\\s+(\\d+)\\s+Hz,.*");
+
+ RegexCaptureParserConfig regexConfig = new RegexCaptureParserConfig();
+ regexConfig.setCaptureMap(captureMap);
+ RegexCaptureParser parser = new RegexCaptureParser(regexConfig);
+
+ Metadata m = new Metadata();
+ try (TikaInputStream tis = TikaInputStream.get(
+ ffmpegStderr.getBytes(StandardCharsets.UTF_8))) {
+ parser.parse(tis, new DefaultHandler(), m, new ParseContext());
+ }
+ assertEquals("00:02:30.50", m.get("xmpDM:duration"));
+ assertEquals("44100", m.get("xmpDM:audioSampleRate"));
+ }
+
+ @Test
+ public void testMultiExternalParsers() throws Exception {
+ assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(
+ new String[]{"exiftool", "-ver"}));
+ assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(
+ new String[]{"ffmpeg", "-version"}));
+
+ TikaLoader loader = TikaLoader.load(
+ getConfigPath(getClass(), "external-parser-multi.json"));
+ CompositeParser composite = (CompositeParser) loader.get(Parser.class);
+ List<Parser> allParsers = composite.getAllComponentParsers();
+
+ assertEquals(2, allParsers.size(), "Expected 2 external parsers but
got " +
+ allParsers.size() + ": " + allParsers);
+
+ // Test the exiftool parser on a PDF
+ Parser autoDetect = loader.loadAutoDetectParser();
+ List<Metadata> metadataList =
getRecursiveMetadata("testOverlappingText.pdf",
+ autoDetect);
+ assertEquals(1, metadataList.size());
+ Metadata m = metadataList.get(0);
+ assertEquals("application/pdf", m.get("exiftool:MIMEType"));
+ assertNotNull(m.get("exiftool:PageCount"));
+ assertNotNull(m.get("exiftool:PDFVersion"));
+ }
+}
diff --git
a/tika-serialization/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java
b/tika-serialization/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java
deleted file mode 100644
index 35e2a7a898..0000000000
---
a/tika-serialization/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.external2;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assumptions.assumeTrue;
-
-import java.nio.charset.StandardCharsets;
-import java.util.List;
-
-import org.junit.jupiter.api.Test;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.config.loader.TikaLoader;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.CompositeParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.ParserDecorator;
-import org.apache.tika.parser.RegexCaptureParser;
-
-public class ExternalParserTest extends TikaTest {
-
- @Test
- public void testConfigRegexCaptureParser() throws Exception {
- assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new
String[]{
- "file", "--version"
- }));
- TikaLoader loader = TikaLoader.load(getConfigPath(getClass(),
"TIKA-3557.json"));
- CompositeParser p = (CompositeParser) loader.get(Parser.class);
- assertEquals(1, p.getAllComponentParsers().size());
- Parser parser = p.getAllComponentParsers().get(0);
- // When _mime-include is used, the parser is wrapped in a
ParserDecorator
- ExternalParser externalParser = (parser instanceof ParserDecorator)
- ? (ExternalParser) ((ParserDecorator)
parser).getWrappedParser()
- : (ExternalParser) parser;
-
- Parser outputParser = externalParser.getOutputParser();
- assertEquals(RegexCaptureParser.class, outputParser.getClass());
-
- Metadata m = new Metadata();
- ContentHandler contentHandler = new DefaultHandler();
- String output = "Something\n" +
- "Title: the quick brown fox\n" +
- "Author: jumped over\n" +
- "Created: 10/20/2024";
- try (TikaInputStream tis =
TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) {
- outputParser.parse(tis, contentHandler, m, new ParseContext());
- }
- assertEquals("the quick brown fox", m.get("title"));
- }
-
- @Test
- public void testConfigBasic() throws Exception {
- assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new
String[]{"file", "--version"}));
- TikaLoader loader = TikaLoader.load(getConfigPath(getClass(),
"TIKA-3557-no-output-parser.json"));
- CompositeParser p = (CompositeParser) loader.get(Parser.class);
- assertEquals(1, p.getAllComponentParsers().size());
- Parser parser = p.getAllComponentParsers().get(0);
- // When _mime-include is used, the parser is wrapped in a
ParserDecorator
- ExternalParser externalParser = (parser instanceof ParserDecorator)
- ? (ExternalParser) ((ParserDecorator)
parser).getWrappedParser()
- : (ExternalParser) parser;
-
- XMLResult xmlResult = getXML("example.xml", externalParser);
- assertContains("<body>text/xml</body>",
xmlResult.xml.replaceAll("[\r\n]", ""));
- }
-
- @Test
- public void testExifTool() throws Exception {
- assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new
String[]{"exiftool",
- "-ver"}));
- TikaLoader loader = TikaLoader.load(getConfigPath(getClass(),
"TIKA-3557-exiftool-example.json"));
- Parser p = loader.loadAutoDetectParser();
- //this was the smallest pdf we had
- List<Metadata> metadataList =
getRecursiveMetadata("testOverlappingText.pdf", p);
- assertEquals(1, metadataList.size());
- Metadata m = metadataList.get(0);
- assertEquals("application/pdf", m.get("mime"));
- assertEquals("1", m.get("pages"));
- assertEquals("1.4", m.get("pdf:version"));
- }
-}
diff --git
a/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json
b/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json
index 683d6b0942..73ed0ba1ef 100644
---
a/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json
+++
b/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json
@@ -4,10 +4,10 @@
"external-parser": {
"_mime-include": ["application/octet-stream"],
"commandLine": ["exiftool", "${INPUT_FILE}"],
- "checkExitValues": [0],
- "outputParser": {
+ "contentSource": "none",
+ "stdoutHandler": {
"regex-capture-parser": {
- "matchMap": {
+ "captureMap": {
"mime": "^MIME Type\\s+: ([^\\r\\n]+)",
"pages": "^Page Count\\s+: ([^\\r\\n]+)",
"pdf:version": "^PDF Version\\s+: ([^\\r\\n]+)"
diff --git a/tika-serialization/src/test/resources/configs/TIKA-3557.json
b/tika-serialization/src/test/resources/configs/TIKA-3557.json
index cd3af89821..a5937bedf6 100644
--- a/tika-serialization/src/test/resources/configs/TIKA-3557.json
+++ b/tika-serialization/src/test/resources/configs/TIKA-3557.json
@@ -4,7 +4,7 @@
"external-parser": {
"_mime-include": ["application/xml"],
"commandLine": ["file", "-b", "--mime-type", "${INPUT_FILE}"],
- "outputParser": {
+ "stdoutHandler": {
"regex-capture-parser": {
"captureMap": {
"title": "^Title: ([^\\r\\n]+)"
diff --git
a/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json
b/tika-serialization/src/test/resources/configs/external-parser-exiftool.json
similarity index 54%
copy from
tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json
copy to
tika-serialization/src/test/resources/configs/external-parser-exiftool.json
index 683d6b0942..fbbed46e68 100644
---
a/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json
+++
b/tika-serialization/src/test/resources/configs/external-parser-exiftool.json
@@ -2,12 +2,19 @@
"parsers": [
{
"external-parser": {
- "_mime-include": ["application/octet-stream"],
+ "_mime-include": [
+ "video/avi",
+ "video/mpeg",
+ "video/x-msvideo",
+ "video/mp4"
+ ],
"commandLine": ["exiftool", "${INPUT_FILE}"],
- "checkExitValues": [0],
- "outputParser": {
+ "checkCommandLine": ["exiftool", "-ver"],
+ "checkErrorCodes": [126, 127],
+ "contentSource": "none",
+ "stdoutHandler": {
"regex-capture-parser": {
- "matchMap": {
+ "captureMap": {
"mime": "^MIME Type\\s+: ([^\\r\\n]+)",
"pages": "^Page Count\\s+: ([^\\r\\n]+)",
"pdf:version": "^PDF Version\\s+: ([^\\r\\n]+)"
diff --git
a/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json
b/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json
new file mode 100644
index 0000000000..3d9dd70bba
--- /dev/null
+++ b/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json
@@ -0,0 +1,35 @@
+{
+ "parsers": [
+ {
+ "external-parser": {
+ "_mime-include": [
+ "video/avi",
+ "video/mpeg",
+ "video/x-msvideo"
+ ],
+ "commandLine": ["ffmpeg", "-i", "${INPUT_FILE}"],
+ "checkCommandLine": ["ffmpeg", "-version"],
+ "checkErrorCodes": [126, 127],
+ "contentSource": "none",
+ "returnStderr": true,
+ "maxStdErr": 20000,
+ "stderrHandler": {
+ "regex-capture-parser": {
+ "captureMap": {
+ "xmpDM:audioSampleRate":
"\\s*Stream.*:.+Audio:.*,\\s+(\\d+)\\s+Hz,.*",
+ "xmpDM:audioChannelType":
"\\s*Stream.*:.+Audio:.*\\d+\\s+Hz,\\s+(\\d{1,2})\\s+channels.*",
+ "xmpDM:audioCompressor":
"\\s*Stream.*:.+Audio:\\s+([A-Za-z0-9_\\(\\)/\\[\\] ]+),.*",
+ "xmpDM:duration": "\\s*Duration:\\s*([0-9:\\.]+),.*",
+ "xmpDM:fileDataRate":
"\\s*Duration:.*,\\s*bitrate:\\s+([0-9A-Za-z/ ]+).*",
+ "xmpDM:videoColorSpace":
"\\s*Stream.*:\\s+Video:\\s+[A-Za-z0-9\\(\\)/ ]+,\\s+([A-Za-z0-9\\(\\)
,]+),\\s+[0-9x]+,.*",
+ "xmpDM:videoCompressor":
"\\s*Stream.*:\\s+Video:\\s+([A-Za-z0-9\\(\\)/ ]+),.*",
+ "xmpDM:videoFrameRate":
"\\s*Stream.*:\\s+Video:.*,\\s+([0-9]+)\\s+fps,.*",
+ "encoder": "\\s*encoder\\s*\\:\\s*(\\w+).*",
+ "videoResolution": "\\s*Stream.*:\\s+Video:.*,\\s+([0-9x]+),.*"
+ }
+ }
+ }
+ }
+ }
+ ]
+}
diff --git
a/tika-serialization/src/test/resources/configs/external-parser-multi.json
b/tika-serialization/src/test/resources/configs/external-parser-multi.json
new file mode 100644
index 0000000000..cce81a55fe
--- /dev/null
+++ b/tika-serialization/src/test/resources/configs/external-parser-multi.json
@@ -0,0 +1,47 @@
+{
+ "parsers": [
+ {
+ "external-parser": {
+ "_mime-include": [
+ "video/avi",
+ "video/mpeg",
+ "video/x-msvideo"
+ ],
+ "commandLine": ["ffmpeg", "-i", "${INPUT_FILE}"],
+ "checkCommandLine": ["ffmpeg", "-version"],
+ "checkErrorCodes": [126, 127],
+ "returnStderr": true,
+ "maxStdErr": 20000,
+ "contentSource": "none",
+ "stderrHandler": {
+ "regex-capture-parser": {
+ "captureMap": {
+ "xmpDM:duration": "\\s*Duration:\\s*([0-9:\\.]+),.*",
+ "xmpDM:audioSampleRate":
"\\s*Stream.*:.+Audio:.*,\\s+(\\d+)\\s+Hz,.*"
+ }
+ }
+ }
+ }
+ },
+ {
+ "external-parser": {
+ "_mime-include": [
+ "application/pdf"
+ ],
+ "commandLine": ["exiftool", "${INPUT_FILE}"],
+ "checkCommandLine": ["exiftool", "-ver"],
+ "checkErrorCodes": [126, 127],
+ "contentSource": "none",
+ "stdoutHandler": {
+ "regex-capture-parser": {
+ "captureMap": {
+ "exiftool:MIMEType": "^MIME Type\\s+: ([^\\r\\n]+)",
+ "exiftool:PageCount": "^Page Count\\s+: ([^\\r\\n]+)",
+ "exiftool:PDFVersion": "^PDF Version\\s+: ([^\\r\\n]+)"
+ }
+ }
+ }
+ }
+ }
+ ]
+}
diff --git
a/tika-serialization/src/test/resources/configs/external-parser-sox.json
b/tika-serialization/src/test/resources/configs/external-parser-sox.json
new file mode 100644
index 0000000000..39ac93c334
--- /dev/null
+++ b/tika-serialization/src/test/resources/configs/external-parser-sox.json
@@ -0,0 +1,37 @@
+{
+ "parsers": [
+ {
+ "external-parser": {
+ "_mime-include": [
+ "audio/mpeg",
+ "audio/mp3",
+ "audio/wav",
+ "audio/x-wav",
+ "audio/ogg",
+ "audio/vorbis",
+ "audio/mp4"
+ ],
+ "commandLine": ["sox", "--info", "${INPUT_FILE}"],
+ "checkCommandLine": ["sox", "--version"],
+ "checkErrorCodes": [126, 127],
+ "returnStderr": true,
+ "maxStdErr": 10000,
+ "contentSource": "none",
+ "stderrHandler": {
+ "regex-capture-parser": {
+ "captureMap": {
+ "xmpDM:audioChannelType": "\\s*Channels.*:\\s+(\\d+)\\s*",
+ "xmpDM:audioSampleRate": "\\s*Sample Rate.*:\\s+(\\d+)\\s*",
+ "xmpDM:audioSampleType": "\\s*Precision.*:\\s+([\\d\\w-]+)\\s*",
+ "xmpDM:duration": "\\s*Duration.*:\\s+([\\d:\\.]+)\\s*",
+ "File Size": "\\s*File Size.*:\\s+([\\d\\w]+)\\s*",
+ "xmpDM:fileDataRate": "\\s*Bit Rate.*:\\s+([\\d\\w]+)\\s*",
+ "Sample Encoding": "\\s*Sample Encoding.*:\\s+(.*)\\s*",
+ "xmpDM:logComment": "\\s*Comment.*:\\s+(.*)\\s*"
+ }
+ }
+ }
+ }
+ }
+ ]
+}
diff --git
a/tika-server/tika-server-standard/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
b/tika-server/tika-server-standard/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
deleted file mode 100644
index 37f87a4595..0000000000
---
a/tika-server/tika-server-standard/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-org.apache.tika.parser.external.CompositeExternalParser
\ No newline at end of file
diff --git
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaParsersTest.java
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaParsersTest.java
index 1eae7ba45a..e437aa4823 100644
---
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaParsersTest.java
+++
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaParsersTest.java
@@ -144,7 +144,7 @@ public class TikaParsersTest extends CXFTestBase {
assertEquals("org.apache.tika.parser.CompositeParser",
json.get("name"));
assertEquals(Boolean.TRUE, json.get("composite"));
- // At least 20 child parsers which aren't composite, except for
CompositeExternalParser
+ // At least 20 child parsers which aren't composite
List<Object> wrapper = (List) json.get("children");
Map<String, Object> firstItem = (Map) wrapper.get(0);
List<Object> children = (List) firstItem.get("children");
@@ -191,7 +191,7 @@ public class TikaParsersTest extends CXFTestBase {
assertEquals(true, hasOOXML);
assertEquals(true, hasZip);
assertTrue(nonComposite > 20);
- assertTrue(composite == 0 || composite == 1); // if
CompositeExternalParser is available it will be 1
+ assertEquals(0, composite);
}
}
}