This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4250
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e5eb4fb17d0cba562c7fede96ff955ac03c8cd51
Author: tallison <[email protected]>
AuthorDate: Thu May 9 10:04:36 2024 -0400

    TIKA-4250 -- add optional parser for pst files -- wrapper for libpst/readpst
---
 CHANGES.txt                                        |   4 +
 .../tika-parser-microsoft-module/pom.xml           |   7 +
 .../tika/parser/microsoft/libpst/EmailVisitor.java | 100 ++++++++++
 .../tika/parser/microsoft/libpst/LibPstParser.java | 216 +++++++++++++++++++++
 .../microsoft/libpst/LibPstParserConfig.java       |  81 ++++++++
 .../parser/microsoft/libpst/TestLibPstParser.java  | 116 +++++++++++
 .../parser/microsoft/libpst/tika-libpst-config.xml |  26 +++
 .../microsoft/libpst/tika-libpst-eml-config.xml    |  30 +++
 8 files changed, 580 insertions(+)

diff --git a/CHANGES.txt b/CHANGES.txt
index 3aa2c7b44..cc4575ff5 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,6 +5,10 @@ Release 3.0.0-BETA2 - ???
    * Updated PST parser to use standard Message metadata keys and improved
      handling of embedded files (TIKA-4248).
 
+   Other Changes
+
+   * Add optional PST parser based on libpst/readpst (TIKA-4250).
+
 Release 3.0.0-BETA - 12/01/2023
 
    BREAKING CHANGES
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
index ef285428a..e0cf5f435 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
@@ -116,6 +116,13 @@
       <artifactId>log4j-slf4j2-impl</artifactId>
       <scope>test</scope>
     </dependency>
+    <!-- needed for libpst test files -->
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-mail-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
   <build>
     <plugins>
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
new file mode 100644
index 000000000..644b2f046
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.libpst;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.FileVisitResult;
+import java.nio.file.FileVisitor;
+import java.nio.file.Path;
+import java.nio.file.attribute.BasicFileAttributes;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PST;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+public class EmailVisitor implements FileVisitor<Path> {
+
+    private final Path root;
+    private final boolean processEmailAsMsg;
+    private final XHTMLContentHandler xhtml;
+    private final Metadata parentMetadata;
+    private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
+
+    public EmailVisitor(Path root, boolean processEmailAsMsg, 
XHTMLContentHandler xhtml, Metadata parentMetadata, ParseContext parseContext) {
+        this.root = root;
+        this.processEmailAsMsg = processEmailAsMsg;
+        this.xhtml = xhtml;
+        this.parentMetadata = parentMetadata;
+        this.embeddedDocumentExtractor = 
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
+    }
+
+    @Override
+    public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes 
attrs) throws IOException {
+        return FileVisitResult.CONTINUE;
+    }
+
+    @Override
+    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) 
throws IOException {
+        if (processEmailAsMsg) {
+            if (file
+                    .getFileName()
+                    .toString()
+                    .endsWith(".msg")) {
+                process(file);
+            }
+        } else if (file
+                .getFileName()
+                .toString()
+                .endsWith(".eml")) {
+            process(file);
+        }
+        return FileVisitResult.CONTINUE;
+    }
+
+    private void process(Path file) throws IOException {
+        Metadata emailMetadata = new Metadata();
+        String pstPath = root
+                .relativize(file.getParent())
+                .toString();
+        emailMetadata.set(PST.PST_FOLDER_PATH, pstPath);
+        try (InputStream is = TikaInputStream.get(file)) {
+            try {
+                embeddedDocumentExtractor.parseEmbedded(is, xhtml, 
emailMetadata, true);
+            } catch (SAXException e) {
+                throw new IOExceptionWithCause(e);
+            }
+        }
+    }
+
+    @Override
+    public FileVisitResult visitFileFailed(Path file, IOException exc) throws 
IOException {
+        return FileVisitResult.CONTINUE;
+    }
+
+    @Override
+    public FileVisitResult postVisitDirectory(Path dir, IOException exc) 
throws IOException {
+        return FileVisitResult.CONTINUE;
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
new file mode 100644
index 000000000..6cf3e249c
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.libpst;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.FileProcessResult;
+import org.apache.tika.utils.ProcessUtils;
+
+/**
+ * This is an optional PST parser that relies on the user installing
+ * the GPL-3 libpst/readpst commandline tool and configuring
+ * Tika to call this library via tika-config.xml
+ */
+public class LibPstParser implements Parser, Initializable {
+
+    public static final MediaType MS_OUTLOOK_PST_MIMETYPE = 
MediaType.application("vnd.ms-outlook-pst");
+
+    private static final Set<MediaType> SUPPORTED = 
Set.of(MS_OUTLOOK_PST_MIMETYPE);
+
+    private static final Logger LOGGER = 
LoggerFactory.getLogger(LibPstParser.class);
+
+    private static final int MAX_STDOUT = 100000;
+    private static final int MAX_STDERR = 10000;
+    private static final String READ_PST_COMMAND = "readpst";
+
+    private LibPstParserConfig defaultConfig = new LibPstParserConfig();
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+        return SUPPORTED;
+    }
+
+    @Override
+    public void parse(InputStream inputStream, ContentHandler contentHandler, 
Metadata metadata, ParseContext parseContext) throws IOException, SAXException, 
TikaException {
+        TikaInputStream tis = TikaInputStream.cast(inputStream);
+        TemporaryResources tmp = null;
+        if (tis == null) {
+            tmp = new TemporaryResources();
+            tis = TikaInputStream.get(inputStream, tmp, metadata);
+        }
+        try {
+            _parse(tis.getPath(), contentHandler, metadata, parseContext);
+        } finally {
+            IOUtils.closeQuietly(tmp);
+        }
+    }
+
+    private void _parse(Path pst, ContentHandler contentHandler, Metadata 
metadata, ParseContext parseContext) throws TikaException, IOException, 
SAXException {
+        LibPstParserConfig activeConfig = 
parseContext.get(LibPstParserConfig.class, defaultConfig);
+        Path outDir = Files.createTempDirectory("libpst-");
+        Path debugFile = activeConfig.isDebug() ? 
Files.createTempFile("tika-libpst-debug", ".txt") : null;
+        try {
+            ProcessBuilder pb = getProcessBuilder(pst, activeConfig, outDir, 
debugFile);
+            XHTMLContentHandler xhtml = new 
XHTMLContentHandler(contentHandler, metadata);
+            FileProcessResult fileProcessResult = ProcessUtils.execute(pb, 
activeConfig.getTimeoutSeconds() * 1000l, MAX_STDOUT, MAX_STDERR);
+            xhtml.startDocument();
+            processContents(outDir, activeConfig, xhtml, metadata, 
parseContext);
+            if (fileProcessResult.isTimeout()) {
+                throw new TikaException("Timeout exception: " + 
fileProcessResult.getProcessTimeMillis());
+            }
+            if (fileProcessResult.getExitValue() != 0) {
+                LOGGER.warn("libpst bad exit value {}: {}", 
fileProcessResult.getExitValue(), fileProcessResult.getStderr());
+                throw new TikaException("Bad exit value: " + 
fileProcessResult.getExitValue());
+            }
+            xhtml.endDocument();
+        } finally {
+            try {
+                FileUtils.deleteDirectory(outDir.toFile());
+            } catch (IOException e) {
+                LOGGER.warn("Couldn't delete temporary directory: " + 
outDir.toAbsolutePath(), e);
+            }
+            try {
+                if (debugFile != null) {
+                    Files.delete(debugFile);
+                }
+            } catch (IOException e) {
+                LOGGER.warn("Couldn't delete debug file?!", e);
+            }
+        }
+    }
+
+    private void processContents(Path outDir, LibPstParserConfig config, 
XHTMLContentHandler xhtml, Metadata metadata, ParseContext parseContext) throws 
IOException {
+        Files.walkFileTree(outDir, new EmailVisitor(outDir, 
config.isProcessEmailAsMsg(), xhtml, metadata, parseContext));
+    }
+
+    private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig 
config, Path outDir, Path debugFile) {
+        List commands = new ArrayList<String>();
+        commands.add(READ_PST_COMMAND);
+        if (config.isDebug()) {
+            commands.add("-d");
+            commands.add(ProcessUtils.escapeCommandLine(debugFile
+                    .toAbsolutePath()
+                    .toString()));
+        }
+        if (config.isIncludeDeleted()) {
+            commands.add("-D");
+        }
+        if (config.isProcessEmailAsMsg()) {
+            commands.add("-m");
+        } else {
+            //include .eml and include extensions
+            commands.add("-e");
+        }
+        commands.add("-o");
+        commands.add(ProcessUtils.escapeCommandLine(outDir
+                .toAbsolutePath()
+                .toString()));
+
+        commands.add(ProcessUtils.escapeCommandLine(pst
+                .toAbsolutePath()
+                .toString()));
+        LOGGER.debug("command arguments: " + commands);
+        return new ProcessBuilder(commands);
+    }
+
+    @Override
+    public void initialize(Map<String, Param> map) throws TikaConfigException {
+        try {
+            check();
+        } catch (IOException e) {
+            LOGGER.error("Couldn't get version of libpst", e);
+            throw new TikaConfigException("Unable to check version of readpst. 
Is it installed?!", e);
+        }
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler 
initializableProblemHandler) throws TikaConfigException {
+
+    }
+
+    //throws exception if readpst is not available
+    private static void check() throws TikaConfigException, IOException {
+        ProcessBuilder pb = new ProcessBuilder(READ_PST_COMMAND, "-V");
+        FileProcessResult result = ProcessUtils.execute(pb, 30000, 10000, 
10000);
+        if (result.getExitValue() != 0) {
+            throw new TikaConfigException(
+                    "bad exit value for LibPstParser. It must be installed and 
on the path" + " if this parser is configured. Exit value: " + 
result.getExitValue());
+        }
+        if (result.isTimeout()) {
+            throw new TikaConfigException("timeout trying to get version from 
readpst?!");
+        }
+    }
+
+    public static boolean checkQuietly() {
+        try {
+            check();
+        } catch (TikaConfigException | IOException e) {
+            return false;
+        }
+        return true;
+    }
+
+    @Field
+    public void setTimeoutSeconds(long timeoutSeconds) {
+        defaultConfig.setTimeoutSeconds(timeoutSeconds);
+    }
+
+    @Field
+    public void setProcessEmailAsMsg(boolean processEmailAsMsg) {
+        defaultConfig.setProcessEmailAsMsg(processEmailAsMsg);
+    }
+
+    @Field
+    public void setIncludeDeleted(boolean includeDeleted) {
+        defaultConfig.setIncludeDeleted(includeDeleted);
+    }
+
+    @Field
+    public void setMaxEmails(int maxEmails) {
+        defaultConfig.setMaxEmails(maxEmails);
+    }
+
+
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
new file mode 100644
index 000000000..c2573c8ed
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.libpst;
+
+public class LibPstParserConfig {
+
+    private long timeoutSeconds = 600;
+    /**
+     * In initial tests, setting this to true resulted in more emails
+     * being extracted. It did dramatically slow down processing time. :(
+     */
+    private boolean isDebug = true;
+
+    /**
+     * Should readpst also output msg files for processing.
+     * In an initial test, not as many attachments were extracted from msg 
files.
+     * Not yet clear if that is a POI limitation or a problem with libpst
+     */
+    private boolean processEmailAsMsg = true;
+
+    private boolean includeDeleted = true;
+
+    /**
+     * max emails to process. Will process everything if this value is < 0
+     */
+    private int maxEmails = -1;
+
+    public long getTimeoutSeconds() {
+        return timeoutSeconds;
+    }
+
+    public void setTimeoutSeconds(long timeoutSeconds) {
+        this.timeoutSeconds = timeoutSeconds;
+    }
+
+    public boolean isDebug() {
+        return isDebug;
+    }
+
+    public void setDebug(boolean debug) {
+        isDebug = debug;
+    }
+
+    public boolean isProcessEmailAsMsg() {
+        return processEmailAsMsg;
+    }
+
+    public void setProcessEmailAsMsg(boolean processEmailAsMsg) {
+        this.processEmailAsMsg = processEmailAsMsg;
+    }
+
+    public boolean isIncludeDeleted() {
+        return includeDeleted;
+    }
+
+    public void setIncludeDeleted(boolean includeDeleted) {
+        this.includeDeleted = includeDeleted;
+    }
+
+    public int getMaxEmails() {
+        return maxEmails;
+    }
+
+    public void setMaxEmails(int maxEmails) {
+        this.maxEmails = maxEmails;
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
new file mode 100644
index 000000000..4bda2bbd4
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.libpst;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+import java.util.List;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Message;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.PST;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+
+public class TestLibPstParser extends TikaTest {
+
+    private static boolean LIBPST_EXISTS = false;
+
+    @BeforeAll
+    public static void setUp() {
+        LIBPST_EXISTS = LibPstParser.checkQuietly();
+    }
+
+    @Test
+    public void testBasic() throws Exception {
+        if (!LIBPST_EXISTS) {
+            return;
+        }
+        TikaConfig tikaConfig = new 
TikaConfig(TestLibPstParser.class.getResourceAsStream("tika-libpst-config.xml"));
+        Parser p = new AutoDetectParser(tikaConfig);
+        List<Metadata> metadataList = getRecursiveMetadata("testPST.pst", p);
+        //libpst is non-deterministic when creating msg files -- sometimes we 
get 7, sometimes 8
+        assumeTrue(metadataList.size() == 8);
+
+        Metadata m0 = metadataList.get(0);
+        assertEquals("org.apache.tika.parser.microsoft.libpst.LibPstParser", 
m0.getValues(TikaCoreProperties.TIKA_PARSED_BY)[1]);
+        int validPaths = 0;
+        for (int i = 1; i < metadataList.size(); i++) {
+            String path = metadataList
+                    .get(i)
+                    .get(PST.PST_FOLDER_PATH);
+            if (path != null) {
+                assertEquals("hong-thai.nguyen", path);
+                validPaths++;
+            }
+        }
+        //NOTE: this processing via lib pst misses an email (with an ooxml 
attachment) embedded inside an email
+        assertEquals(7, validPaths);
+
+        assertEquals("Hong-Thai Nguyen", metadataList
+                .get(1)
+                .get(Message.MESSAGE_TO_DISPLAY_NAME));
+        assertContains("See you there!", metadataList
+                .get(1)
+                .get(TikaCoreProperties.TIKA_CONTENT));
+
+        assertEquals("NOTE", metadataList
+                .get(7)
+                .get(Office.MAPI_MESSAGE_CLASS));
+    }
+
+    @Test
+    public void testEml() throws Exception {
+        if (!LIBPST_EXISTS) {
+            return;
+        }
+        TikaConfig tikaConfig = new 
TikaConfig(TestLibPstParser.class.getResourceAsStream("tika-libpst-eml-config.xml"));
+        Parser p = new AutoDetectParser(tikaConfig);
+
+        List<Metadata> metadataList = getRecursiveMetadata("testPST.pst", p);
+        assertEquals(10, metadataList.size());
+        Metadata m0 = metadataList.get(0);
+        assertEquals("org.apache.tika.parser.microsoft.libpst.LibPstParser", 
m0.getValues(TikaCoreProperties.TIKA_PARSED_BY)[1]);
+        int validPaths = 0;
+        for (int i = 1; i < metadataList.size(); i++) {
+            String path = metadataList
+                    .get(i)
+                    .get(PST.PST_FOLDER_PATH);
+            if (path != null) {
+                assertEquals("hong-thai.nguyen", path);
+                validPaths++;
+            }
+        }
+        assertEquals(7, validPaths);
+        assertContains("See you there!", metadataList
+                .get(3)
+                .get(TikaCoreProperties.TIKA_CONTENT));
+
+        
assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 metadataList
+                .get(4)
+                .get(Metadata.CONTENT_TYPE));
+    }
+
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-config.xml
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-config.xml
new file mode 100644
index 000000000..c2e3b5150
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-config.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude 
class="org.apache.tika.parser.microsoft.pst.OutlookPSTParser"/>
+      <parser-exclude 
class="org.apache.tika.parser.microsoft.pst.PSTMailItemParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.microsoft.libpst.LibPstParser"/>
+  </parsers>
+</properties>
\ No newline at end of file
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-eml-config.xml
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-eml-config.xml
new file mode 100644
index 000000000..7be83be4f
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-eml-config.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude 
class="org.apache.tika.parser.microsoft.pst.OutlookPSTParser"/>
+      <parser-exclude 
class="org.apache.tika.parser.microsoft.pst.PSTMailItemParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.microsoft.libpst.LibPstParser">
+      <params>
+        <param name="processEmailAsMsg" type="bool">false</param>
+      </params>
+    </parser>
+  </parsers>
+</properties>
\ No newline at end of file

Reply via email to