This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 92053ea71 TIKA-4355 -- LibPstParserConfig should be serializable
(#2060)
92053ea71 is described below
commit 92053ea71cc58a51ff2389f4fbaf92262c1fa088
Author: Tim Allison <[email protected]>
AuthorDate: Wed Dec 4 16:12:22 2024 -0500
TIKA-4355 -- LibPstParserConfig should be serializable (#2060)
* TIKA-4355 -- LibPstParserConfig should be serializable
* TIKA-4355 -- LibPstParser should allow the path to readpst to be
configurable
---
.../tika/parser/microsoft/libpst/LibPstParser.java | 47 ++++++++++++++++++----
.../microsoft/libpst/LibPstParserConfig.java | 4 +-
.../parser/microsoft/libpst/TestLibPstParser.java | 3 +-
.../parser/fork/ForkParserIntegrationTest.java | 20 +++++++++
.../test/resources/configs/tika-config-lib-pst.xml | 26 ++++++++++++
5 files changed, 91 insertions(+), 9 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
index 6cf3e249c..b34a9e3e1 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@@ -47,6 +48,7 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;
+import org.apache.tika.utils.StringUtils;
/**
* This is an optional PST parser that relies on the user installing
@@ -65,8 +67,10 @@ public class LibPstParser implements Parser, Initializable {
private static final int MAX_STDERR = 10000;
private static final String READ_PST_COMMAND = "readpst";
- private LibPstParserConfig defaultConfig = new LibPstParserConfig();
-
+ private final LibPstParserConfig defaultConfig = new LibPstParserConfig();
+ //for security purposes, this cannot be set via the parseContext. This must
+ //be set via the usual @Field setters in tika-config.xml
+ private String readPstPath = "";
@Override
public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
return SUPPORTED;
@@ -125,9 +129,10 @@ public class LibPstParser implements Parser, Initializable
{
Files.walkFileTree(outDir, new EmailVisitor(outDir,
config.isProcessEmailAsMsg(), xhtml, metadata, parseContext));
}
- private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig
config, Path outDir, Path debugFile) {
+ private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig
config, Path outDir, Path debugFile)
+ throws TikaConfigException {
List commands = new ArrayList<String>();
- commands.add(READ_PST_COMMAND);
+ commands.add(getFullReadPstCommand());
if (config.isDebug()) {
commands.add("-d");
commands.add(ProcessUtils.escapeCommandLine(debugFile
@@ -157,6 +162,13 @@ public class LibPstParser implements Parser, Initializable
{
@Override
public void initialize(Map<String, Param> map) throws TikaConfigException {
+ if (readPstPath.contains("\u0000")) {
+ throw new TikaConfigException("path can't include null values");
+ }
+ String fullReadPstCommand = getFullReadPstCommand();
+ if (! Files.isRegularFile(Paths.get(fullReadPstCommand))) {
+ throw new TikaConfigException("I regret I can't find the readpst
executable: " + fullReadPstCommand);
+ }
try {
check();
} catch (IOException e) {
@@ -171,8 +183,10 @@ public class LibPstParser implements Parser, Initializable
{
}
//throws exception if readpst is not available
- private static void check() throws TikaConfigException, IOException {
- ProcessBuilder pb = new ProcessBuilder(READ_PST_COMMAND, "-V");
+ private void check() throws TikaConfigException, IOException {
+ String fullReadPstCommand = getFullReadPstCommand();
+
+ ProcessBuilder pb = new
ProcessBuilder(ProcessUtils.escapeCommandLine(fullReadPstCommand), "-V");
FileProcessResult result = ProcessUtils.execute(pb, 30000, 10000,
10000);
if (result.getExitValue() != 0) {
throw new TikaConfigException(
@@ -183,7 +197,7 @@ public class LibPstParser implements Parser, Initializable {
}
}
- public static boolean checkQuietly() {
+ public boolean checkQuietly() {
try {
check();
} catch (TikaConfigException | IOException e) {
@@ -192,6 +206,16 @@ public class LibPstParser implements Parser, Initializable
{
return true;
}
+ /**
+ * Builds the command used to launch readpst.
+ * If no directory has been configured (blank readPstPath), the bare command
+ * name is returned. Otherwise the configured directory is joined with the
+ * command name, adding a "/" only when the directory does not already end
+ * with "/" or "\".
+ *
+ * @return the readpst command, prefixed with the configured directory if one was set
+ * @throws TikaConfigException declared in the signature; not thrown by this implementation
+ */
+ private String getFullReadPstCommand() throws TikaConfigException {
+ if (StringUtils.isBlank(readPstPath)) {
+ return READ_PST_COMMAND;
+ }
+ //add a separator only if the configured directory lacks a trailing one;
+ //both checks must be negated or "/opt/bin" would yield "/opt/binreadpst"
+ if (! readPstPath.endsWith("/") && ! readPstPath.endsWith("\\")) {
+ return readPstPath + "/" + READ_PST_COMMAND;
+ }
+ return readPstPath + READ_PST_COMMAND;
+ }
+
@Field
public void setTimeoutSeconds(long timeoutSeconds) {
defaultConfig.setTimeoutSeconds(timeoutSeconds);
@@ -212,5 +236,14 @@ public class LibPstParser implements Parser, Initializable
{
defaultConfig.setMaxEmails(maxEmails);
}
+ /**
+ * Sets the directory that contains the readpst executable. This should
+ * include the path up to but not including 'readpst', e.g. "C:\my_bin" where
+ * readpst is at "C:\my_bin\readpst".
+ *
+ * @param readPstPath directory containing readpst, without the executable name
+ */
+ @Field
+ public void setReadPstPath(String readPstPath) {
+ this.readPstPath = readPstPath;
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
index c2573c8ed..3b157739a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
@@ -16,7 +16,9 @@
*/
package org.apache.tika.parser.microsoft.libpst;
-public class LibPstParserConfig {
+import java.io.Serializable;
+
+public class LibPstParserConfig implements Serializable {
private long timeoutSeconds = 600;
/**
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
index 73e623393..31cf69f8b 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
@@ -40,7 +40,8 @@ public class TestLibPstParser extends TikaTest {
@BeforeAll
public static void setUp() {
- LIBPST_EXISTS = LibPstParser.checkQuietly();
+ //test that readpst is on the path
+ LIBPST_EXISTS = new LibPstParser().checkQuietly();
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
index d80908154..a1ce0f05b 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
@@ -37,15 +37,18 @@ import org.xml.sax.SAXException;
import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.fork.ForkParser;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.microsoft.libpst.LibPstParser;
import org.apache.tika.sax.BodyContentHandler;
/**
@@ -210,6 +213,23 @@ public class ForkParserIntegrationTest extends
MultiThreadedTikaTest {
}
}
+ /**
+ * Parses the test PST through a ForkParser that wraps an AutoDetectParser
+ * configured from /configs/tika-config-lib-pst.xml and checks the extracted text.
+ * Returns without asserting anything when LibPstParser#checkQuietly() is false.
+ */
+ @Test
+ public void testLibPstParser() throws Exception {
+ if (! new LibPstParser().checkQuietly()) {
+ return;
+ }
+ TikaConfig tikaConfig;
+ //close the config stream once the config has been built from it
+ try (InputStream configStream = ForkParserIntegrationTest.class
+ .getResourceAsStream("/configs/tika-config-lib-pst.xml")) {
+ tikaConfig = new TikaConfig(configStream);
+ }
+ //manage the test document stream alongside the parser so it is always closed
+ try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(),
+ new AutoDetectParser(tikaConfig));
+ InputStream stream = getResourceAsStream("/test-documents/testPST.pst")) {
+ ContentHandler output = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+ parser.parse(stream, output, new Metadata(), context);
+ assertContains("Barry Olddog", output.toString());
+ }
+ }
+
@Test
@Disabled("use for development/one off testing. This is a beast and takes
enormous " +
"resources and time")
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.xml
new file mode 100644
index 000000000..df5a43127
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude
class="org.apache.tika.parser.microsoft.pst.OutlookPSTParser"/>
+ <parser-exclude
class="org.apache.tika.parser.microsoft.pst.PSTMailItemParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.libpst.LibPstParser"/>
+ </parsers>
+</properties>