This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4250 in repository https://gitbox.apache.org/repos/asf/tika.git
commit e5eb4fb17d0cba562c7fede96ff955ac03c8cd51 Author: tallison <[email protected]> AuthorDate: Thu May 9 10:04:36 2024 -0400 TIKA-4250 -- add optional parser for pst files -- wrapper for libpst/readpst --- CHANGES.txt | 4 + .../tika-parser-microsoft-module/pom.xml | 7 + .../tika/parser/microsoft/libpst/EmailVisitor.java | 100 ++++++++++ .../tika/parser/microsoft/libpst/LibPstParser.java | 216 +++++++++++++++++++++ .../microsoft/libpst/LibPstParserConfig.java | 81 ++++++++ .../parser/microsoft/libpst/TestLibPstParser.java | 116 +++++++++++ .../parser/microsoft/libpst/tika-libpst-config.xml | 26 +++ .../microsoft/libpst/tika-libpst-eml-config.xml | 30 +++ 8 files changed, 580 insertions(+) diff --git a/CHANGES.txt b/CHANGES.txt index 3aa2c7b44..cc4575ff5 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -5,6 +5,10 @@ Release 3.0.0-BETA2 - ??? * Updated PST parser to use standard Message metadata keys and improved handling of embedded files (TIKA-4248). + Other Changes + + * Add optional PST parser based on libpst/readpst (TIKA-4250). 
+ Release 3.0.0-BETA - 12/01/2023 BREAKING CHANGES diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml index ef285428a..e0cf5f435 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml @@ -116,6 +116,13 @@ <artifactId>log4j-slf4j2-impl</artifactId> <scope>test</scope> </dependency> + <!-- needed for libpst test files --> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-parser-mail-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> </dependencies> <build> <plugins> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java new file mode 100644 index 000000000..644b2f046 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.libpst; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.FileVisitResult; +import java.nio.file.FileVisitor; +import java.nio.file.Path; +import java.nio.file.attribute.BasicFileAttributes; + +import org.apache.commons.io.IOExceptionWithCause; +import org.xml.sax.SAXException; + +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.PST; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; + +public class EmailVisitor implements FileVisitor<Path> { + + private final Path root; + private final boolean processEmailAsMsg; + private final XHTMLContentHandler xhtml; + private final Metadata parentMetadata; + private final EmbeddedDocumentExtractor embeddedDocumentExtractor; + + public EmailVisitor(Path root, boolean processEmailAsMsg, XHTMLContentHandler xhtml, Metadata parentMetadata, ParseContext parseContext) { + this.root = root; + this.processEmailAsMsg = processEmailAsMsg; + this.xhtml = xhtml; + this.parentMetadata = parentMetadata; + this.embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext); + } + + @Override + public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFile(Path file, 
BasicFileAttributes attrs) throws IOException { + if (processEmailAsMsg) { + if (file + .getFileName() + .toString() + .endsWith(".msg")) { + process(file); + } + } else if (file + .getFileName() + .toString() + .endsWith(".eml")) { + process(file); + } + return FileVisitResult.CONTINUE; + } + + private void process(Path file) throws IOException { + Metadata emailMetadata = new Metadata(); + String pstPath = root + .relativize(file.getParent()) + .toString(); + emailMetadata.set(PST.PST_FOLDER_PATH, pstPath); + try (InputStream is = TikaInputStream.get(file)) { + try { + embeddedDocumentExtractor.parseEmbedded(is, xhtml, emailMetadata, true); + } catch (SAXException e) { + throw new IOExceptionWithCause(e); + } + } + } + + @Override + public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException { + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { + return FileVisitResult.CONTINUE; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java new file mode 100644 index 000000000..6cf3e249c --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParser.java @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.libpst; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.config.Field; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.FileProcessResult; +import org.apache.tika.utils.ProcessUtils; + +/** + * This is an optional PST parser that relies on the user installing + * the GPL-3 libpst/readpst commandline tool and configuring + * Tika to call this library via tika-config.xml + */ +public class LibPstParser implements Parser, 
Initializable { + + public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst"); + + private static final Set<MediaType> SUPPORTED = Set.of(MS_OUTLOOK_PST_MIMETYPE); + + private static final Logger LOGGER = LoggerFactory.getLogger(LibPstParser.class); + + private static final int MAX_STDOUT = 100000; + private static final int MAX_STDERR = 10000; + private static final String READ_PST_COMMAND = "readpst"; + + private LibPstParserConfig defaultConfig = new LibPstParserConfig(); + + @Override + public Set<MediaType> getSupportedTypes(ParseContext parseContext) { + return SUPPORTED; + } + + @Override + public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException { + TikaInputStream tis = TikaInputStream.cast(inputStream); + TemporaryResources tmp = null; + if (tis == null) { + tmp = new TemporaryResources(); + tis = TikaInputStream.get(inputStream, tmp, metadata); + } + try { + _parse(tis.getPath(), contentHandler, metadata, parseContext); + } finally { + IOUtils.closeQuietly(tmp); + } + } + + private void _parse(Path pst, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws TikaException, IOException, SAXException { + LibPstParserConfig activeConfig = parseContext.get(LibPstParserConfig.class, defaultConfig); + Path outDir = Files.createTempDirectory("libpst-"); + Path debugFile = activeConfig.isDebug() ? 
Files.createTempFile("tika-libpst-debug", ".txt") : null; + try { + ProcessBuilder pb = getProcessBuilder(pst, activeConfig, outDir, debugFile); + XHTMLContentHandler xhtml = new XHTMLContentHandler(contentHandler, metadata); + FileProcessResult fileProcessResult = ProcessUtils.execute(pb, activeConfig.getTimeoutSeconds() * 1000l, MAX_STDOUT, MAX_STDERR); + xhtml.startDocument(); + processContents(outDir, activeConfig, xhtml, metadata, parseContext); + if (fileProcessResult.isTimeout()) { + throw new TikaException("Timeout exception: " + fileProcessResult.getProcessTimeMillis()); + } + if (fileProcessResult.getExitValue() != 0) { + LOGGER.warn("libpst bad exit value {}: {}", fileProcessResult.getExitValue(), fileProcessResult.getStderr()); + throw new TikaException("Bad exit value: " + fileProcessResult.getExitValue()); + } + xhtml.endDocument(); + } finally { + try { + FileUtils.deleteDirectory(outDir.toFile()); + } catch (IOException e) { + LOGGER.warn("Couldn't delete temporary directory: " + outDir.toAbsolutePath(), e); + } + try { + if (debugFile != null) { + Files.delete(debugFile); + } + } catch (IOException e) { + LOGGER.warn("Couldn't delete debug file?!", e); + } + } + } + + private void processContents(Path outDir, LibPstParserConfig config, XHTMLContentHandler xhtml, Metadata metadata, ParseContext parseContext) throws IOException { + Files.walkFileTree(outDir, new EmailVisitor(outDir, config.isProcessEmailAsMsg(), xhtml, metadata, parseContext)); + } + + private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig config, Path outDir, Path debugFile) { + List commands = new ArrayList<String>(); + commands.add(READ_PST_COMMAND); + if (config.isDebug()) { + commands.add("-d"); + commands.add(ProcessUtils.escapeCommandLine(debugFile + .toAbsolutePath() + .toString())); + } + if (config.isIncludeDeleted()) { + commands.add("-D"); + } + if (config.isProcessEmailAsMsg()) { + commands.add("-m"); + } else { + //include .eml and include extensions + 
commands.add("-e"); + } + commands.add("-o"); + commands.add(ProcessUtils.escapeCommandLine(outDir + .toAbsolutePath() + .toString())); + + commands.add(ProcessUtils.escapeCommandLine(pst + .toAbsolutePath() + .toString())); + LOGGER.debug("command arguments: " + commands); + return new ProcessBuilder(commands); + } + + @Override + public void initialize(Map<String, Param> map) throws TikaConfigException { + try { + check(); + } catch (IOException e) { + LOGGER.error("Couldn't get version of libpst", e); + throw new TikaConfigException("Unable to check version of readpst. Is it installed?!", e); + } + } + + @Override + public void checkInitialization(InitializableProblemHandler initializableProblemHandler) throws TikaConfigException { + + } + + //throws exception if readpst is not available + private static void check() throws TikaConfigException, IOException { + ProcessBuilder pb = new ProcessBuilder(READ_PST_COMMAND, "-V"); + FileProcessResult result = ProcessUtils.execute(pb, 30000, 10000, 10000); + if (result.getExitValue() != 0) { + throw new TikaConfigException( + "bad exit value for LibPstParser. It must be installed and on the path" + " if this parser is configured. 
Exit value: " + result.getExitValue()); + } + if (result.isTimeout()) { + throw new TikaConfigException("timeout trying to get version from readpst?!"); + } + } + + public static boolean checkQuietly() { + try { + check(); + } catch (TikaConfigException | IOException e) { + return false; + } + return true; + } + + @Field + public void setTimeoutSeconds(long timeoutSeconds) { + defaultConfig.setTimeoutSeconds(timeoutSeconds); + } + + @Field + public void setProcessEmailAsMsg(boolean processEmailAsMsg) { + defaultConfig.setProcessEmailAsMsg(processEmailAsMsg); + } + + @Field + public void setIncludeDeleted(boolean includeDeleted) { + defaultConfig.setIncludeDeleted(includeDeleted); + } + + @Field + public void setMaxEmails(int maxEmails) { + defaultConfig.setMaxEmails(maxEmails); + } + + +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java new file mode 100644 index 000000000..c2573c8ed --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/LibPstParserConfig.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.libpst; + +public class LibPstParserConfig { + + private long timeoutSeconds = 600; + /** + * In initial tests, setting this to true resulted in more emails + * being extracted. It did dramatically slow down processing time. :( + */ + private boolean isDebug = true; + + /** + * Should readpst also output msg files for processing. + * In an initial test, not as many attachments were extracted from msg files. + * Not yet clear if that is a POI limitation or a problem with libpst + */ + private boolean processEmailAsMsg = true; + + private boolean includeDeleted = true; + + /** + * max emails to process. 
Will process everything if this value is < 0. NOTE(review): this value does not appear to be read anywhere in this commit, so the limit is presumably not enforced yet -- confirm + */ + private int maxEmails = -1; + + public long getTimeoutSeconds() { + return timeoutSeconds; + } + + public void setTimeoutSeconds(long timeoutSeconds) { + this.timeoutSeconds = timeoutSeconds; + } + + public boolean isDebug() { + return isDebug; + } + + public void setDebug(boolean debug) { + isDebug = debug; + } + + public boolean isProcessEmailAsMsg() { + return processEmailAsMsg; + } + + public void setProcessEmailAsMsg(boolean processEmailAsMsg) { + this.processEmailAsMsg = processEmailAsMsg; + } + + public boolean isIncludeDeleted() { + return includeDeleted; + } + + public void setIncludeDeleted(boolean includeDeleted) { + this.includeDeleted = includeDeleted; + } + + public int getMaxEmails() { + return maxEmails; + } + + public void setMaxEmails(int maxEmails) { + this.maxEmails = maxEmails; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java new file mode 100644 index 000000000..4bda2bbd4 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.libpst; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assumptions.assumeTrue; + +import java.util.List; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Message; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.PST; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; + +public class TestLibPstParser extends TikaTest { + + private static boolean LIBPST_EXISTS = false; + + @BeforeAll + public static void setUp() { + LIBPST_EXISTS = LibPstParser.checkQuietly(); + } + + @Test + public void testBasic() throws Exception { + if (!LIBPST_EXISTS) { + return; + } + TikaConfig tikaConfig = new TikaConfig(TestLibPstParser.class.getResourceAsStream("tika-libpst-config.xml")); + Parser p = new AutoDetectParser(tikaConfig); + List<Metadata> metadataList = getRecursiveMetadata("testPST.pst", p); + //libpst is non-deterministic when creating msg files -- sometimes we get 7, sometimes 8 + assumeTrue(metadataList.size() == 8); + + Metadata m0 = metadataList.get(0); + assertEquals("org.apache.tika.parser.microsoft.libpst.LibPstParser", m0.getValues(TikaCoreProperties.TIKA_PARSED_BY)[1]); + int validPaths = 0; + for (int i = 1; i < metadataList.size(); i++) { + String path = 
metadataList + .get(i) + .get(PST.PST_FOLDER_PATH); + if (path != null) { + assertEquals("hong-thai.nguyen", path); + validPaths++; + } + } + //NOTE: this processing via lib pst misses an email (with an ooxml attachment) embedded inside an email + assertEquals(7, validPaths); + + assertEquals("Hong-Thai Nguyen", metadataList + .get(1) + .get(Message.MESSAGE_TO_DISPLAY_NAME)); + assertContains("See you there!", metadataList + .get(1) + .get(TikaCoreProperties.TIKA_CONTENT)); + + assertEquals("NOTE", metadataList + .get(7) + .get(Office.MAPI_MESSAGE_CLASS)); + } + + @Test + public void testEml() throws Exception { + if (!LIBPST_EXISTS) { + return; + } + TikaConfig tikaConfig = new TikaConfig(TestLibPstParser.class.getResourceAsStream("tika-libpst-eml-config.xml")); + Parser p = new AutoDetectParser(tikaConfig); + + List<Metadata> metadataList = getRecursiveMetadata("testPST.pst", p); + assertEquals(10, metadataList.size()); + Metadata m0 = metadataList.get(0); + assertEquals("org.apache.tika.parser.microsoft.libpst.LibPstParser", m0.getValues(TikaCoreProperties.TIKA_PARSED_BY)[1]); + int validPaths = 0; + for (int i = 1; i < metadataList.size(); i++) { + String path = metadataList + .get(i) + .get(PST.PST_FOLDER_PATH); + if (path != null) { + assertEquals("hong-thai.nguyen", path); + validPaths++; + } + } + assertEquals(7, validPaths); + assertContains("See you there!", metadataList + .get(3) + .get(TikaCoreProperties.TIKA_CONTENT)); + + assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", metadataList + .get(4) + .get(Metadata.CONTENT_TYPE)); + } + +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-config.xml new file mode 100644 
index 000000000..c2e3b5150 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-config.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"> + <parser-exclude class="org.apache.tika.parser.microsoft.pst.OutlookPSTParser"/> + <parser-exclude class="org.apache.tika.parser.microsoft.pst.PSTMailItemParser"/> + </parser> + <parser class="org.apache.tika.parser.microsoft.libpst.LibPstParser"/> + </parsers> +</properties> \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-eml-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-eml-config.xml new file mode 100644 index 000000000..7be83be4f --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/libpst/tika-libpst-eml-config.xml @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"> + <parser-exclude class="org.apache.tika.parser.microsoft.pst.OutlookPSTParser"/> + <parser-exclude class="org.apache.tika.parser.microsoft.pst.PSTMailItemParser"/> + </parser> + <parser class="org.apache.tika.parser.microsoft.libpst.LibPstParser"> + <params> + <param name="processEmailAsMsg" type="bool">false</param> + </params> + </parser> + </parsers> +</properties> \ No newline at end of file
