This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit f40dda9d5a7c933ac8c79ff32d73de98ae428614
Author: Tim Allison <[email protected]>
AuthorDate: Wed Dec 4 16:12:22 2024 -0500

    TIKA-4355 -- LibPstParserConfig should be serializable (#2060)
    
    * TIKA-4355 -- LibPstParserConfig should be serializable
    
    * TIKA-4355 -- LibPstParser should allow the path to readpst to be configurable
    
    (cherry picked from commit 92053ea71cc58a51ff2389f4fbaf92262c1fa088)
---
 .../tika/parser/microsoft/libpst/LibPstParser.java | 47 ++++++++++++++++++----
 .../microsoft/libpst/LibPstParserConfig.java       |  4 +-
 .../parser/microsoft/libpst/TestLibPstParser.java  |  3 +-
 .../parser/fork/ForkParserIntegrationTest.java     | 20 +++++++++
 .../test/resources/configs/tika-config-lib-pst.xml | 26 ++++++++++++
 5 files changed, 91 insertions(+), 9 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
index 6cf3e249c..b34a9e3e1 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
@@ -20,6 +20,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -47,6 +48,7 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.FileProcessResult;
 import org.apache.tika.utils.ProcessUtils;
+import org.apache.tika.utils.StringUtils;
 
 /**
  * This is an optional PST parser that relies on the user installing
@@ -65,8 +67,10 @@ public class LibPstParser implements Parser, Initializable {
     private static final int MAX_STDERR = 10000;
     private static final String READ_PST_COMMAND = "readpst";
 
-    private LibPstParserConfig defaultConfig = new LibPstParserConfig();
-
+    private final LibPstParserConfig defaultConfig = new LibPstParserConfig();
+    //for security purposes, this cannot be set via the parseContext. This must
+    //be set via the usual @Field setters in tika-config.xml
+    private String readPstPath = "";
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
         return SUPPORTED;
@@ -125,9 +129,10 @@ public class LibPstParser implements Parser, Initializable 
{
         Files.walkFileTree(outDir, new EmailVisitor(outDir, 
config.isProcessEmailAsMsg(), xhtml, metadata, parseContext));
     }
 
-    private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig 
config, Path outDir, Path debugFile) {
+    private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig 
config, Path outDir, Path debugFile)
+            throws TikaConfigException {
         List commands = new ArrayList<String>();
-        commands.add(READ_PST_COMMAND);
+        commands.add(getFullReadPstCommand());
         if (config.isDebug()) {
             commands.add("-d");
             commands.add(ProcessUtils.escapeCommandLine(debugFile
@@ -157,6 +162,13 @@ public class LibPstParser implements Parser, Initializable 
{
 
     @Override
     public void initialize(Map<String, Param> map) throws TikaConfigException {
+        if (readPstPath.contains("\u0000")) {
+            throw new TikaConfigException("path can't include null values");
+        }
+        String fullReadPstCommand = getFullReadPstCommand();
+        if (! Files.isRegularFile(Paths.get(fullReadPstCommand))) {
+            throw new TikaConfigException("I regret I can't find the readpst 
executable: " + fullReadPstCommand);
+        }
         try {
             check();
         } catch (IOException e) {
@@ -171,8 +183,10 @@ public class LibPstParser implements Parser, Initializable 
{
     }
 
     //throws exception if readpst is not available
-    private static void check() throws TikaConfigException, IOException {
-        ProcessBuilder pb = new ProcessBuilder(READ_PST_COMMAND, "-V");
+    private void check() throws TikaConfigException, IOException {
+        String fullReadPstCommand = getFullReadPstCommand();
+
+        ProcessBuilder pb = new 
ProcessBuilder(ProcessUtils.escapeCommandLine(fullReadPstCommand), "-V");
         FileProcessResult result = ProcessUtils.execute(pb, 30000, 10000, 
10000);
         if (result.getExitValue() != 0) {
             throw new TikaConfigException(
@@ -183,7 +197,7 @@ public class LibPstParser implements Parser, Initializable {
         }
     }
 
-    public static boolean checkQuietly() {
+    public boolean checkQuietly() {
         try {
             check();
         } catch (TikaConfigException | IOException e) {
@@ -192,6 +206,16 @@ public class LibPstParser implements Parser, Initializable 
{
         return true;
     }
 
+    /**
+     * Builds the command string used to invoke readpst.
+     * <p>
+     * When {@code readPstPath} is blank, the bare command name is returned.
+     * Otherwise the configured directory is joined with the command name, and a
+     * separator is inserted only if the directory does not already end with one.
+     *
+     * @return the bare command name, or the configured directory followed by the command name
+     * @throws TikaConfigException declared for callers; this method does not throw it itself
+     */
+    private String getFullReadPstCommand() throws TikaConfigException {
+        if (StringUtils.isBlank(readPstPath)) {
+            return READ_PST_COMMAND;
+        }
+        //add a separator only when the directory has no trailing '/' or '\';
+        //both checks must be negated, otherwise "/usr/bin" becomes "/usr/binreadpst"
+        if (! readPstPath.endsWith("/") && ! readPstPath.endsWith("\\")) {
+            return readPstPath + "/" + READ_PST_COMMAND;
+        }
+        return readPstPath + READ_PST_COMMAND;
+    }
+
     @Field
     public void setTimeoutSeconds(long timeoutSeconds) {
         defaultConfig.setTimeoutSeconds(timeoutSeconds);
@@ -212,5 +236,14 @@ public class LibPstParser implements Parser, Initializable 
{
         defaultConfig.setMaxEmails(maxEmails);
     }
 
+    /**
+     * Sets the directory that contains the readpst executable.
+     * <p>
+     * This should include the path up to but not including 'readpst',
+     * e.g. "C:\my_bin" where readpst is at "C:\my_bin\readpst".
+     *
+     * @param readPstPath directory containing readpst; {@code null} is treated
+     *                    as the empty string, i.e. no directory configured
+     */
+    @Field
+    public void setReadPstPath(String readPstPath) {
+        //keep the field non-null: initialize() calls readPstPath.contains(...)
+        this.readPstPath = readPstPath == null ? "" : readPstPath;
+    }
 
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
index c2573c8ed..3b157739a 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
@@ -16,7 +16,9 @@
  */
 package org.apache.tika.parser.microsoft.libpst;
 
-public class LibPstParserConfig {
+import java.io.Serializable;
+
+public class LibPstParserConfig implements Serializable {
 
     private long timeoutSeconds = 600;
     /**
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
index 8e6863596..63b08cb29 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
@@ -40,7 +40,8 @@ public class TestLibPstParser extends TikaTest {
 
     @BeforeAll
     public static void setUp() {
-        LIBPST_EXISTS = LibPstParser.checkQuietly();
+        //test that readpst is on the path
+        LIBPST_EXISTS = new LibPstParser().checkQuietly();
     }
 
     @Test
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
index d80908154..a1ce0f05b 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
@@ -37,15 +37,18 @@ import org.xml.sax.SAXException;
 
 import org.apache.tika.MultiThreadedTikaTest;
 import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.fork.ForkParser;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.microsoft.libpst.LibPstParser;
 import org.apache.tika.sax.BodyContentHandler;
 
 /**
@@ -210,6 +213,23 @@ public class ForkParserIntegrationTest extends 
MultiThreadedTikaTest {
         }
     }
 
+    @Test
+    public void testLibPstParser() throws Exception {
+        if (! new LibPstParser().checkQuietly()) {
+            return;
+        }
+        TikaConfig tikaConfig = new TikaConfig(
+                
ForkParserIntegrationTest.class.getResourceAsStream("/configs/tika-config-lib-pst.xml"));
+        try (ForkParser parser = new 
ForkParser(ForkParserIntegrationTest.class.getClassLoader(),
+                new AutoDetectParser(tikaConfig))) {
+            ContentHandler output = new BodyContentHandler();
+            InputStream stream = 
getResourceAsStream("/test-documents/testPST.pst");
+            ParseContext context = new ParseContext();
+            parser.parse(stream, output, new Metadata(), context);
+            assertContains("Barry Olddog", output.toString());
+        }
+    }
+
     @Test
     @Disabled("use for development/one off testing.  This is a beast and takes 
enormous " +
             "resources and time")
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.xml
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.xml
new file mode 100644
index 000000000..df5a43127
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <!-- DefaultParser is loaded with the two pst.* parsers excluded, and
+         LibPstParser is registered explicitly below.
+         NOTE(review): presumably this routes PST files to LibPstParser when
+         this config is used; confirm against the parser selection rules. -->
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude 
class="org.apache.tika.parser.microsoft.pst.OutlookPSTParser"/>
+      <parser-exclude 
class="org.apache.tika.parser.microsoft.pst.PSTMailItemParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.microsoft.libpst.LibPstParser"/>
+  </parsers>
+</properties>

Reply via email to