This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 32baf2345 TIKA-4250 -- add optional parser for pst files -- wrapper
for libpst/readpst (#1751)
32baf2345 is described below
commit 32baf2345abe1a04d767ea6641a567d5c924587e
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 9 10:40:05 2024 -0400
TIKA-4250 -- add optional parser for pst files -- wrapper for
libpst/readpst (#1751)
---
CHANGES.txt | 4 +
.../tika-parser-microsoft-module/pom.xml | 7 +
.../tika/parser/microsoft/libpst/EmailVisitor.java | 100 ++++++++++
.../tika/parser/microsoft/libpst/LibPstParser.java | 216 +++++++++++++++++++++
.../microsoft/libpst/LibPstParserConfig.java | 81 ++++++++
.../parser/microsoft/libpst/TestLibPstParser.java | 116 +++++++++++
.../parser/microsoft/libpst/tika-libpst-config.xml | 26 +++
.../microsoft/libpst/tika-libpst-eml-config.xml | 30 +++
8 files changed, 580 insertions(+)
diff --git a/CHANGES.txt b/CHANGES.txt
index 3aa2c7b44..cc4575ff5 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,6 +5,10 @@ Release 3.0.0-BETA2 - ???
* Updated PST parser to use standard Message metadata keys and improved
handling of embedded files (TIKA-4248).
+ Other Changes
+
+ * Add optional PST parser based on libpst/readpst (TIKA-4250).
+
Release 3.0.0-BETA - 12/01/2023
BREAKING CHANGES
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
index ef285428a..e0cf5f435 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
@@ -116,6 +116,13 @@
<artifactId>log4j-slf4j2-impl</artifactId>
<scope>test</scope>
</dependency>
+ <!-- needed for libpst test files -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-mail-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
<plugins>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
new file mode 100644
index 000000000..644b2f046
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.libpst;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.FileVisitResult;
+import java.nio.file.FileVisitor;
+import java.nio.file.Path;
+import java.nio.file.attribute.BasicFileAttributes;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PST;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * Walks the directory tree written by readpst and hands each extracted email
+ * file to the {@link EmbeddedDocumentExtractor} as an embedded document.
+ * <p>
+ * A single walk handles exactly one kind of file: <code>.msg</code> files when
+ * <code>processEmailAsMsg</code> is <code>true</code>, <code>.eml</code> files
+ * otherwise. Any other file in the tree is skipped.
+ */
+public class EmailVisitor implements FileVisitor<Path> {
+
+    private static final String MSG_SUFFIX = ".msg";
+    private static final String EML_SUFFIX = ".eml";
+
+    private final Path root;
+    private final boolean processEmailAsMsg;
+    private final XHTMLContentHandler xhtml;
+    //not read anywhere in this class; retained because it is part of the constructor signature
+    private final Metadata parentMetadata;
+    private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
+
+    /**
+     * @param root              directory that readpst wrote into; folder paths are reported relative to it
+     * @param processEmailAsMsg if true, process <code>.msg</code> files; otherwise process <code>.eml</code> files
+     * @param xhtml             handler that receives the output of each embedded parse
+     * @param parentMetadata    metadata of the containing pst file
+     * @param parseContext      context from which the embedded document extractor is obtained
+     */
+    public EmailVisitor(Path root, boolean processEmailAsMsg, XHTMLContentHandler xhtml,
+                        Metadata parentMetadata, ParseContext parseContext) {
+        this.root = root;
+        this.processEmailAsMsg = processEmailAsMsg;
+        this.xhtml = xhtml;
+        this.parentMetadata = parentMetadata;
+        this.embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
+    }
+
+    @Override
+    public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
+        return FileVisitResult.CONTINUE;
+    }
+
+    /**
+     * Processes the file if its name carries the suffix selected by
+     * <code>processEmailAsMsg</code>; always continues the walk.
+     */
+    @Override
+    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
+        String wantedSuffix = processEmailAsMsg ? MSG_SUFFIX : EML_SUFFIX;
+        String fileName = file.getFileName().toString();
+        if (fileName.endsWith(wantedSuffix)) {
+            process(file);
+        }
+        return FileVisitResult.CONTINUE;
+    }
+
+    /**
+     * Parses a single extracted email as an embedded document.
+     * The folder path recorded in the metadata is the file's parent directory
+     * relative to {@link #root}.
+     *
+     * @throws IOException on a read failure, or wrapping any SAXException raised by the embedded parse
+     */
+    private void process(Path file) throws IOException {
+        Metadata emailMetadata = new Metadata();
+        String pstPath = root.relativize(file.getParent()).toString();
+        emailMetadata.set(PST.PST_FOLDER_PATH, pstPath);
+
+        //give the extractor the chance to decline this document before it is opened
+        if (!embeddedDocumentExtractor.shouldParseEmbedded(emailMetadata)) {
+            return;
+        }
+        try (InputStream is = TikaInputStream.get(file)) {
+            embeddedDocumentExtractor.parseEmbedded(is, xhtml, emailMetadata, true);
+        } catch (SAXException e) {
+            //FileVisitor methods may only throw IOException
+            throw new IOExceptionWithCause(e);
+        }
+    }
+
+    /**
+     * A file that cannot be visited is skipped so that the rest of the tree is still processed.
+     */
+    @Override
+    public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
+        return FileVisitResult.CONTINUE;
+    }
+
+    @Override
+    public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
+        return FileVisitResult.CONTINUE;
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
new file mode 100644
index 000000000..6cf3e249c
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.libpst;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.FileProcessResult;
+import org.apache.tika.utils.ProcessUtils;
+
+/**
+ * This is an optional PST parser that relies on the user installing
+ * the GPL-3 libpst/readpst commandline tool and configuring
+ * Tika to call this library via tika-config.xml
+ * <p>
+ * readpst is run against the pst file and writes <code>.msg</code> or
+ * <code>.eml</code> files into a temporary directory; each of those files is
+ * then parsed as an embedded document. The temporary directory is removed
+ * when the parse finishes.
+ */
+public class LibPstParser implements Parser, Initializable {
+
+    public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");
+
+    private static final Set<MediaType> SUPPORTED = Set.of(MS_OUTLOOK_PST_MIMETYPE);
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(LibPstParser.class);
+
+    //limits handed to ProcessUtils.execute for the captured stdout and stderr of readpst
+    private static final int MAX_STDOUT = 100000;
+    private static final int MAX_STDERR = 10000;
+    private static final String READ_PST_COMMAND = "readpst";
+
+    //used whenever the ParseContext does not carry its own LibPstParserConfig
+    private final LibPstParserConfig defaultConfig = new LibPstParserConfig();
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+        return SUPPORTED;
+    }
+
+    /**
+     * Parses a pst file. readpst needs a file on disk, so a stream that is not
+     * already a {@link TikaInputStream} is wrapped in one; the temporary
+     * resources created for that wrapper are released before returning.
+     */
+    @Override
+    public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata,
+                      ParseContext parseContext) throws IOException, SAXException, TikaException {
+        TikaInputStream tis = TikaInputStream.cast(inputStream);
+        TemporaryResources tmp = null;
+        if (tis == null) {
+            tmp = new TemporaryResources();
+            tis = TikaInputStream.get(inputStream, tmp, metadata);
+        }
+        try {
+            _parse(tis.getPath(), contentHandler, metadata, parseContext);
+        } finally {
+            IOUtils.closeQuietly(tmp);
+        }
+    }
+
+    /**
+     * Runs readpst on the given file, processes whatever it wrote and then
+     * reports a timeout or a non-zero exit value.
+     *
+     * @throws TikaException if readpst timed out or exited with a non-zero value
+     */
+    private void _parse(Path pst, ContentHandler contentHandler, Metadata metadata,
+                        ParseContext parseContext) throws TikaException, IOException, SAXException {
+        LibPstParserConfig activeConfig = parseContext.get(LibPstParserConfig.class, defaultConfig);
+        Path outDir = Files.createTempDirectory("libpst-");
+        Path debugFile = null;
+        try {
+            //created inside the try so that outDir is still cleaned up if this fails
+            if (activeConfig.isDebug()) {
+                debugFile = Files.createTempFile("tika-libpst-debug", ".txt");
+            }
+            ProcessBuilder pb = getProcessBuilder(pst, activeConfig, outDir, debugFile);
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(contentHandler, metadata);
+            FileProcessResult fileProcessResult =
+                    ProcessUtils.execute(pb, activeConfig.getTimeoutSeconds() * 1000L, MAX_STDOUT, MAX_STDERR);
+            xhtml.startDocument();
+            //process the output before looking at the result so that files written
+            //ahead of a timeout or failure are still extracted
+            processContents(outDir, activeConfig, xhtml, metadata, parseContext);
+            if (fileProcessResult.isTimeout()) {
+                throw new TikaException("Timeout exception: " + fileProcessResult.getProcessTimeMillis());
+            }
+            if (fileProcessResult.getExitValue() != 0) {
+                LOGGER.warn("libpst bad exit value {}: {}", fileProcessResult.getExitValue(),
+                        fileProcessResult.getStderr());
+                throw new TikaException("Bad exit value: " + fileProcessResult.getExitValue());
+            }
+            xhtml.endDocument();
+        } finally {
+            cleanUp(outDir, debugFile);
+        }
+    }
+
+    /**
+     * Best-effort removal of the temporary output directory and debug file;
+     * failures are logged and never propagated.
+     */
+    private static void cleanUp(Path outDir, Path debugFile) {
+        try {
+            FileUtils.deleteDirectory(outDir.toFile());
+        } catch (IOException e) {
+            LOGGER.warn("Couldn't delete temporary directory: {}", outDir.toAbsolutePath(), e);
+        }
+        if (debugFile == null) {
+            return;
+        }
+        try {
+            Files.deleteIfExists(debugFile);
+        } catch (IOException e) {
+            LOGGER.warn("Couldn't delete debug file?!", e);
+        }
+    }
+
+    private void processContents(Path outDir, LibPstParserConfig config, XHTMLContentHandler xhtml,
+                                 Metadata metadata, ParseContext parseContext) throws IOException {
+        Files.walkFileTree(outDir,
+                new EmailVisitor(outDir, config.isProcessEmailAsMsg(), xhtml, metadata, parseContext));
+    }
+
+    /**
+     * Builds the readpst command line.
+     *
+     * @param debugFile file that receives readpst's debug output, or null when debugging is off
+     */
+    private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig config, Path outDir,
+                                             Path debugFile) {
+        List<String> commands = new ArrayList<>();
+        commands.add(READ_PST_COMMAND);
+        if (config.isDebug() && debugFile != null) {
+            commands.add("-d");
+            commands.add(ProcessUtils.escapeCommandLine(debugFile.toAbsolutePath().toString()));
+        }
+        if (config.isIncludeDeleted()) {
+            commands.add("-D");
+        }
+        if (config.isProcessEmailAsMsg()) {
+            commands.add("-m");
+        } else {
+            //include .eml and include extensions
+            commands.add("-e");
+        }
+        commands.add("-o");
+        commands.add(ProcessUtils.escapeCommandLine(outDir.toAbsolutePath().toString()));
+
+        commands.add(ProcessUtils.escapeCommandLine(pst.toAbsolutePath().toString()));
+        LOGGER.debug("command arguments: {}", commands);
+        return new ProcessBuilder(commands);
+    }
+
+    /**
+     * Verifies that readpst can be run.
+     *
+     * @throws TikaConfigException if readpst is unavailable, times out or exits with a non-zero value
+     */
+    @Override
+    public void initialize(Map<String, Param> map) throws TikaConfigException {
+        try {
+            check();
+        } catch (IOException e) {
+            LOGGER.error("Couldn't get version of libpst", e);
+            throw new TikaConfigException("Unable to check version of readpst. Is it installed?!", e);
+        }
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler initializableProblemHandler)
+            throws TikaConfigException {
+        //nothing further to check; availability of readpst is verified in initialize()
+    }
+
+    //throws exception if readpst is not available
+    private static void check() throws TikaConfigException, IOException {
+        ProcessBuilder pb = new ProcessBuilder(READ_PST_COMMAND, "-V");
+        FileProcessResult result = ProcessUtils.execute(pb, 30000, 10000, 10000);
+        //test for the timeout first; otherwise a hung process is reported as a bad exit value
+        if (result.isTimeout()) {
+            throw new TikaConfigException("timeout trying to get version from readpst?!");
+        }
+        if (result.getExitValue() != 0) {
+            throw new TikaConfigException(
+                    "bad exit value for LibPstParser. It must be installed and on the path" +
+                            " if this parser is configured. Exit value: " + result.getExitValue());
+        }
+    }
+
+    /**
+     * @return true if <code>readpst -V</code> ran and exited with 0; false otherwise
+     */
+    public static boolean checkQuietly() {
+        try {
+            check();
+        } catch (TikaConfigException | IOException e) {
+            return false;
+        }
+        return true;
+    }
+
+    @Field
+    public void setTimeoutSeconds(long timeoutSeconds) {
+        defaultConfig.setTimeoutSeconds(timeoutSeconds);
+    }
+
+    /**
+     * Whether readpst should be run with its debug option (<code>-d</code>).
+     */
+    @Field
+    public void setDebug(boolean debug) {
+        defaultConfig.setDebug(debug);
+    }
+
+    @Field
+    public void setProcessEmailAsMsg(boolean processEmailAsMsg) {
+        defaultConfig.setProcessEmailAsMsg(processEmailAsMsg);
+    }
+
+    @Field
+    public void setIncludeDeleted(boolean includeDeleted) {
+        defaultConfig.setIncludeDeleted(includeDeleted);
+    }
+
+    /**
+     * NOTE(review): the value is stored in the config, but nothing in this class
+     * reads it -- confirm whether a limit is meant to be enforced.
+     */
+    @Field
+    public void setMaxEmails(int maxEmails) {
+        defaultConfig.setMaxEmails(maxEmails);
+    }
+
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
new file mode 100644
index 000000000..c2573c8ed
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.libpst;
+
+/**
+ * Settings for a single run of the libpst/readpst based parser.
+ */
+public class LibPstParserConfig {
+
+    /**
+     * Time allowed for the readpst process, in seconds.
+     */
+    private long timeoutSeconds = 600;
+
+    /**
+     * In initial tests, setting this to true resulted in more emails
+     * being extracted. It did dramatically slow down processing time. :(
+     */
+    private boolean debug = true;
+
+    /**
+     * Should readpst also output msg files for processing.
+     * In an initial test, not as many attachments were extracted from msg files.
+     * Not yet clear if that is a POI limitation or a problem with libpst
+     */
+    private boolean processEmailAsMsg = true;
+
+    private boolean includeDeleted = true;
+
+    /**
+     * max emails to process. Will process everything if this value is &lt; 0
+     */
+    private int maxEmails = -1;
+
+    /**
+     * @return the timeout in seconds
+     */
+    public long getTimeoutSeconds() {
+        return this.timeoutSeconds;
+    }
+
+    /**
+     * @param timeoutSeconds the timeout in seconds
+     */
+    public void setTimeoutSeconds(long timeoutSeconds) {
+        this.timeoutSeconds = timeoutSeconds;
+    }
+
+    /**
+     * @return whether debug mode is switched on
+     */
+    public boolean isDebug() {
+        return this.debug;
+    }
+
+    /**
+     * @param debug whether to switch debug mode on
+     */
+    public void setDebug(boolean debug) {
+        this.debug = debug;
+    }
+
+    /**
+     * @return whether emails are processed as msg files
+     */
+    public boolean isProcessEmailAsMsg() {
+        return this.processEmailAsMsg;
+    }
+
+    /**
+     * @param processEmailAsMsg whether emails are processed as msg files
+     */
+    public void setProcessEmailAsMsg(boolean processEmailAsMsg) {
+        this.processEmailAsMsg = processEmailAsMsg;
+    }
+
+    /**
+     * @return whether deleted items are included
+     */
+    public boolean isIncludeDeleted() {
+        return this.includeDeleted;
+    }
+
+    /**
+     * @param includeDeleted whether deleted items are included
+     */
+    public void setIncludeDeleted(boolean includeDeleted) {
+        this.includeDeleted = includeDeleted;
+    }
+
+    /**
+     * @return the maximum number of emails to process; a negative value means no limit
+     */
+    public int getMaxEmails() {
+        return this.maxEmails;
+    }
+
+    /**
+     * @param maxEmails the maximum number of emails to process; a negative value means no limit
+     */
+    public void setMaxEmails(int maxEmails) {
+        this.maxEmails = maxEmails;
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
new file mode 100644
index 000000000..4bda2bbd4
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.libpst;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+import java.util.List;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Message;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.PST;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+
+/**
+ * Tests for {@link LibPstParser}. Every test is skipped (not silently passed)
+ * when readpst cannot be run on the current machine.
+ */
+public class TestLibPstParser extends TikaTest {
+
+    private static final String LIBPST_PARSER_CLASS = "org.apache.tika.parser.microsoft.libpst.LibPstParser";
+    private static final String EXPECTED_FOLDER = "hong-thai.nguyen";
+
+    private static boolean LIBPST_EXISTS = false;
+
+    @BeforeAll
+    public static void setUp() {
+        LIBPST_EXISTS = LibPstParser.checkQuietly();
+    }
+
+    @Test
+    public void testBasic() throws Exception {
+        assumeTrue(LIBPST_EXISTS, "readpst is not available");
+        Parser p = loadParser("tika-libpst-config.xml");
+        List<Metadata> metadataList = getRecursiveMetadata("testPST.pst", p);
+        //libpst is non-deterministic when creating msg files -- sometimes we get 7, sometimes 8
+        assumeTrue(metadataList.size() == 8);
+
+        Metadata m0 = metadataList.get(0);
+        assertEquals(LIBPST_PARSER_CLASS, m0.getValues(TikaCoreProperties.TIKA_PARSED_BY)[1]);
+        //NOTE: this processing via lib pst misses an email (with an ooxml attachment) embedded inside an email
+        assertEquals(7, countFolderPaths(metadataList));
+
+        Metadata m1 = metadataList.get(1);
+        assertEquals("Hong-Thai Nguyen", m1.get(Message.MESSAGE_TO_DISPLAY_NAME));
+        assertContains("See you there!", m1.get(TikaCoreProperties.TIKA_CONTENT));
+
+        assertEquals("NOTE", metadataList.get(7).get(Office.MAPI_MESSAGE_CLASS));
+    }
+
+    @Test
+    public void testEml() throws Exception {
+        assumeTrue(LIBPST_EXISTS, "readpst is not available");
+        Parser p = loadParser("tika-libpst-eml-config.xml");
+
+        List<Metadata> metadataList = getRecursiveMetadata("testPST.pst", p);
+        assertEquals(10, metadataList.size());
+        Metadata m0 = metadataList.get(0);
+        assertEquals(LIBPST_PARSER_CLASS, m0.getValues(TikaCoreProperties.TIKA_PARSED_BY)[1]);
+        assertEquals(7, countFolderPaths(metadataList));
+        assertContains("See you there!", metadataList.get(3).get(TikaCoreProperties.TIKA_CONTENT));
+
+        assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                metadataList.get(4).get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Builds an auto-detecting parser from a config file that sits next to this class on the classpath.
+     */
+    private static Parser loadParser(String configName) throws Exception {
+        TikaConfig tikaConfig = new TikaConfig(TestLibPstParser.class.getResourceAsStream(configName));
+        return new AutoDetectParser(tikaConfig);
+    }
+
+    /**
+     * Counts the embedded documents (index 1 onwards) that carry a pst folder path,
+     * asserting that each such path is the expected folder.
+     */
+    private static int countFolderPaths(List<Metadata> metadataList) {
+        int validPaths = 0;
+        for (int i = 1; i < metadataList.size(); i++) {
+            String path = metadataList.get(i).get(PST.PST_FOLDER_PATH);
+            if (path != null) {
+                assertEquals(EXPECTED_FOLDER, path);
+                validPaths++;
+            }
+        }
+        return validPaths;
+    }
+
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-config.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-config.xml
new file mode 100644
index 000000000..c2e3b5150
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-config.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude
class="org.apache.tika.parser.microsoft.pst.OutlookPSTParser"/>
+ <parser-exclude
class="org.apache.tika.parser.microsoft.pst.PSTMailItemParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.libpst.LibPstParser"/>
+ </parsers>
+</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-eml-config.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-eml-config.xml
new file mode 100644
index 000000000..7be83be4f
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-eml-config.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude
class="org.apache.tika.parser.microsoft.pst.OutlookPSTParser"/>
+ <parser-exclude
class="org.apache.tika.parser.microsoft.pst.PSTMailItemParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.libpst.LibPstParser">
+ <params>
+ <param name="processEmailAsMsg" type="bool">false</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
\ No newline at end of file