This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit f40dda9d5a7c933ac8c79ff32d73de98ae428614 Author: Tim Allison <[email protected]> AuthorDate: Wed Dec 4 16:12:22 2024 -0500 TIKA-4355 -- LibPstParserConfig should be serializable (#2060) * TIKA-4355 -- LibPstParserConfig should be serializable * TIKA-4355 -- LibPstParser should allow the path to readpst to be configurable (cherry picked from commit 92053ea71cc58a51ff2389f4fbaf92262c1fa088) --- .../tika/parser/microsoft/libpst/LibPstParser.java | 47 ++++++++++++++++++---- .../microsoft/libpst/LibPstParserConfig.java | 4 +- .../parser/microsoft/libpst/TestLibPstParser.java | 3 +- .../parser/fork/ForkParserIntegrationTest.java | 20 +++++++++ .../test/resources/configs/tika-config-lib-pst.xml | 26 ++++++++++++ 5 files changed, 91 insertions(+), 9 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java index 6cf3e249c..b34a9e3e1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -47,6 +48,7 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.FileProcessResult; import org.apache.tika.utils.ProcessUtils; +import org.apache.tika.utils.StringUtils; /** * This is an optional PST parser that relies on the user installing @@ -65,8 +67,10 @@ public class LibPstParser implements Parser, Initializable { private static final int MAX_STDERR = 10000; private static final String READ_PST_COMMAND = "readpst"; - private LibPstParserConfig defaultConfig = new LibPstParserConfig(); - + private final LibPstParserConfig defaultConfig = new LibPstParserConfig(); + //for security purposes, this cannot be set via the parseContext. This must + //be set via the usual @Field setters in tika-config.xml + private String readPstPath = ""; @Override public Set<MediaType> getSupportedTypes(ParseContext parseContext) { return SUPPORTED; @@ -125,9 +129,10 @@ public class LibPstParser implements Parser, Initializable { Files.walkFileTree(outDir, new EmailVisitor(outDir, config.isProcessEmailAsMsg(), xhtml, metadata, parseContext)); } - private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig config, Path outDir, Path debugFile) { + private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig config, Path outDir, Path debugFile) + throws TikaConfigException { List commands = new ArrayList<String>(); - commands.add(READ_PST_COMMAND); + commands.add(getFullReadPstCommand()); if (config.isDebug()) { commands.add("-d"); commands.add(ProcessUtils.escapeCommandLine(debugFile @@ -157,6 +162,13 @@ public class LibPstParser implements Parser, Initializable { @Override public void initialize(Map<String, Param> map) throws TikaConfigException { + if (readPstPath.contains("\u0000")) { + throw new TikaConfigException("path can't include null values"); + } + String fullReadPstCommand = getFullReadPstCommand(); + if (! Files.isRegularFile(Paths.get(fullReadPstCommand))) { + throw new TikaConfigException("I regret I can't find the readpst executable: " + fullReadPstCommand); + } try { check(); } catch (IOException e) { @@ -171,8 +183,10 @@ public class LibPstParser implements Parser, Initializable { } //throws exception if readpst is not available - private static void check() throws TikaConfigException, IOException { - ProcessBuilder pb = new ProcessBuilder(READ_PST_COMMAND, "-V"); + private void check() throws TikaConfigException, IOException { + String fullReadPstCommand = getFullReadPstCommand(); + + ProcessBuilder pb = new ProcessBuilder(ProcessUtils.escapeCommandLine(fullReadPstCommand), "-V"); FileProcessResult result = ProcessUtils.execute(pb, 30000, 10000, 10000); if (result.getExitValue() != 0) { throw new TikaConfigException( @@ -183,7 +197,7 @@ public class LibPstParser implements Parser, Initializable { } } - public static boolean checkQuietly() { + public boolean checkQuietly() { try { check(); } catch (TikaConfigException | IOException e) { @@ -192,6 +206,16 @@ public class LibPstParser implements Parser, Initializable { return true; } + private String getFullReadPstCommand() throws TikaConfigException { + if (StringUtils.isBlank(readPstPath)) { + return READ_PST_COMMAND; + } + if (! readPstPath.endsWith("/") && readPstPath.endsWith("\\")) { + return readPstPath + "/" + READ_PST_COMMAND; + } + return readPstPath + READ_PST_COMMAND; + } + @Field public void setTimeoutSeconds(long timeoutSeconds) { defaultConfig.setTimeoutSeconds(timeoutSeconds); @@ -212,5 +236,14 @@ public class LibPstParser implements Parser, Initializable { defaultConfig.setMaxEmails(maxEmails); } + /** + * This should include the path up to but not including 'readpst', e.g. "C:\my_bin" where + * readpst is at "C:\my_bin\readpst" + * @param readPstPath + */ + @Field + public void setReadPstPath(String readPstPath) { + this.readPstPath = readPstPath; + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java index c2573c8ed..3b157739a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java @@ -16,7 +16,9 @@ */ package org.apache.tika.parser.microsoft.libpst; -public class LibPstParserConfig { +import java.io.Serializable; + +public class LibPstParserConfig implements Serializable { private long timeoutSeconds = 600; /** diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java index 8e6863596..63b08cb29 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java @@ -40,7 +40,8 @@ public class TestLibPstParser extends TikaTest { @BeforeAll public static void setUp() { - LIBPST_EXISTS = LibPstParser.checkQuietly(); + //test that readpst is on the path + LIBPST_EXISTS = new LibPstParser().checkQuietly(); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java index d80908154..a1ce0f05b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java @@ -37,15 +37,18 @@ import org.xml.sax.SAXException; import org.apache.tika.MultiThreadedTikaTest; import org.apache.tika.Tika; +import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.fork.ForkParser; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.parser.microsoft.libpst.LibPstParser; import org.apache.tika.sax.BodyContentHandler; /** @@ -210,6 +213,23 @@ public class ForkParserIntegrationTest extends MultiThreadedTikaTest { } } + @Test + public void testLibPstParser() throws Exception { + if (! new LibPstParser().checkQuietly()) { + return; + } + TikaConfig tikaConfig = new TikaConfig( + ForkParserIntegrationTest.class.getResourceAsStream("/configs/tika-config-lib-pst.xml")); + try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), + new AutoDetectParser(tikaConfig))) { + ContentHandler output = new BodyContentHandler(); + InputStream stream = getResourceAsStream("/test-documents/testPST.pst"); + ParseContext context = new ParseContext(); + parser.parse(stream, output, new Metadata(), context); + assertContains("Barry Olddog", output.toString()); + } + } + @Test @Disabled("use for development/one off testing. This is a beast and takes enormous " + "resources and time") diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.xml new file mode 100644 index 000000000..df5a43127 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"> + <parser-exclude class="org.apache.tika.parser.microsoft.pst.OutlookPSTParser"/> + <parser-exclude class="org.apache.tika.parser.microsoft.pst.PSTMailItemParser"/> + </parser> + <parser class="org.apache.tika.parser.microsoft.libpst.LibPstParser"/> + </parsers> +</properties>
