This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 17826b53e TIKA-4518 (#2366)
17826b53e is described below
commit 17826b53ef1c114a426c55b4422baee4acf5043b
Author: Tim Allison <[email protected]>
AuthorDate: Wed Oct 15 13:34:36 2025 -0400
TIKA-4518 (#2366)
* TIKA-4518 -- improve pst handling with -Z option
---
CHANGES.txt | 14 ++----
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 4 +-
.../java/org/apache/tika/cli/TikaCLIAsyncTest.java | 2 +-
.../test/java/org/apache/tika/cli/TikaCLITest.java | 29 ++++++++++-
tika-app/src/test/resources/test-data/testPST.pst | Bin 0 -> 2302976 bytes
.../extractor/DefaultEmbeddedStreamTranslator.java | 21 ++++----
.../tika/extractor/EmbeddedStreamTranslator.java | 8 +--
.../apache/tika/extractor/RUnpackExtractor.java | 36 +++++++++-----
.../java/org/apache/tika/io/FilenameUtils.java | 29 ++++++++++-
.../java/org/apache/tika/io/FilenameUtilsTest.java | 8 ++-
.../microsoft/MSEmbeddedStreamTranslator.java | 39 ++++++---------
.../microsoft/PSTEmailStreamTranslator.java | 55 +++++++++++++++++++++
....apache.tika.extractor.EmbeddedStreamTranslator | 3 +-
.../AbstractEmbeddedDocumentBytesHandler.java | 37 ++------------
.../server/core/resource/UnpackerResource.java | 27 ++++------
15 files changed, 194 insertions(+), 118 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 6de20938a..2d91e48c2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,17 +7,11 @@ Release 4.0.0-BETA1 - ???
* Headers are no longer injected into the body/content of MSG files
(TIKA-4345). Please open
a ticket if you need this behavior across email formats.
- * Remove tika-batch (TIKA-4333).
+ * Removed several modules, including: tika-batch (TIKA-4333), snaps
deployment (TIKA-4502),
+ dotnet (TIKA-4332), advanced media module (TIKA-4500), tika-dl module
(TIKA-4499),
+ tika-fuzzing module (TIKA-4506).
- * Remove snaps deployment (TIKA-4502).
-
- * Removed the dotnet module (TIKA-4332).
-
- * Removed the advanced media module (TIKA-4500).
-
- * Removed the tika-dl module (TIKA-4499).
-
- * Removed the tika-fuzzing module (TIKA-4506).
+ * API changes in the EmbeddedStreamTranslator (TIKA-4518).
OTHER CHANGES
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 28a9b29c7..a1db5f8bf 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -1112,9 +1112,7 @@ public class TikaCLI {
try (OutputStream os = Files.newOutputStream(outputFile)) {
if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
- try (InputStream translated =
embeddedStreamTranslator.translate(tis, metadata)) {
- IOUtils.copy(translated, os);
- }
+ embeddedStreamTranslator.translate(tis, metadata, os);
} else {
IOUtils.copy(tis, os);
}
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
index 096e3ce73..faacd49a2 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
@@ -121,7 +121,7 @@ public class TikaCLIAsyncTest {
json++;
}
}
- assertEquals(20, json);
+ assertEquals(21, json);
}
private void checkForPrettyPrint(File f) throws IOException {
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 93de4e409..94ccfd96c 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -26,6 +26,7 @@ import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
+import java.io.Reader;
import java.net.URI;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
@@ -34,6 +35,7 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import org.jetbrains.annotations.NotNull;
@@ -44,7 +46,11 @@ import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.serialization.JsonMetadataList;
import org.apache.tika.utils.ProcessUtils;
+import org.apache.tika.utils.StringUtils;
/**
* Tests the Tika's cli
@@ -285,6 +291,28 @@ public class TikaCLITest {
testRecursiveUnpack("testPDFPackage.pdf", expectedChildren, 2);
}
+ @Test
+ public void testPSTRUnpack() throws Exception {
+ String[] expectedChildren = new String[]{"testPST.pst.json",
+ "testPST.pst-embed/00000007-First email.msg",
+ "testPST.pst-embed/00000001-Feature Generators.msg",
+ "testPST.pst-embed/00000008-First email.msg",
+ "testPST.pst-embed/00000004-[jira] [Resolved] (TIKA-1249)
Vcard files detection.msg",
+ "testPST.pst-embed/00000003-Feature Generators.msg",
+ "testPST.pst-embed/00000002-putstatic\".msg",
+ "testPST.pst-embed/00000005-[jira] [Commented] (TIKA-1250)
Process loops infintely processing a CHM file.msg",
+ "testPST.pst-embed/00000009-attachment.docx",
+ "testPST.pst-embed/00000006-[WEBINAR] - \"Introducing
Couchbase Server 2.5\".msg"};
+ testRecursiveUnpack("testPST.pst", expectedChildren, 2);
+ try (Reader reader =
Files.newBufferedReader(extractDir.resolve("testPST.pst.json"))) {
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ for (Metadata m : metadataList) {
+ String content = m.get(TikaCoreProperties.TIKA_CONTENT);
+ assertFalse(StringUtils.isBlank(content));
+ }
+ }
+ }
+
/**
* Tests -l option of the cli
@@ -378,7 +406,6 @@ public class TikaCLITest {
.list();
assertNotNull(jsonFile);
assertEquals(expectedLength, jsonFile.length);
- //assertEquals(fileNames.size(), expectedChildrenFileNames.length);
for (String expectedChildName : expectedChildrenFileNames) {
assertTrue(fileNames.contains(expectedChildName));
diff --git a/tika-app/src/test/resources/test-data/testPST.pst
b/tika-app/src/test/resources/test-data/testPST.pst
new file mode 100644
index 000000000..8ccc69547
Binary files /dev/null and b/tika-app/src/test/resources/test-data/testPST.pst
differ
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
index 537c5ffa1..b9d6985cc 100644
---
a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
+++
b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
@@ -17,18 +17,17 @@
package org.apache.tika.extractor;
import java.io.IOException;
-import java.io.InputStream;
+import java.io.OutputStream;
import java.util.List;
import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.utils.ServiceLoaderUtils;
/**
* Loads EmbeddedStreamTranslators via service loading. Tries to run each
- * in turn and returns the first non-null value. If no translation has
occurred,
- * this returns the original InputStream. If a translation has occurred, the
- * translator will consume the InputStream but not close it.
+ * in turn. If a translator accepts the stream, it will do the translation but
not close the stream.
*/
public class DefaultEmbeddedStreamTranslator implements
EmbeddedStreamTranslator {
@@ -58,7 +57,7 @@ public class DefaultEmbeddedStreamTranslator implements
EmbeddedStreamTranslator
* @throws IOException
*/
@Override
- public boolean shouldTranslate(InputStream inputStream, Metadata metadata)
throws IOException {
+ public boolean shouldTranslate(TikaInputStream inputStream, Metadata
metadata) throws IOException {
for (EmbeddedStreamTranslator translator : translators) {
if (translator.shouldTranslate(inputStream, metadata)) {
return true;
@@ -68,20 +67,20 @@ public class DefaultEmbeddedStreamTranslator implements
EmbeddedStreamTranslator
}
/**
- * This will consume the InputStream and return a new stream of translated
bytes.
+ * This will consume the InputStream and write the stream to the output
stream
* @param inputStream
* @param metadata
+ * @param outputStream to write to
* @return
* @throws IOException
*/
@Override
- public InputStream translate(InputStream inputStream, Metadata metadata)
throws IOException {
+ public void translate(TikaInputStream inputStream, Metadata metadata,
OutputStream outputStream) throws IOException {
for (EmbeddedStreamTranslator translator : translators) {
- InputStream translated = translator.translate(inputStream,
metadata);
- if (translated != null) {
- return translated;
+ if (translator.shouldTranslate(inputStream, metadata)) {
+ translator.translate(inputStream, metadata, outputStream);
+ return;
}
}
- return inputStream;
}
}
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
index b2ce05db4..4a582506f 100644
---
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
+++
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
@@ -17,8 +17,9 @@
package org.apache.tika.extractor;
import java.io.IOException;
-import java.io.InputStream;
+import java.io.OutputStream;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
/**
@@ -30,9 +31,8 @@ import org.apache.tika.metadata.Metadata;
*/
public interface EmbeddedStreamTranslator {
- boolean shouldTranslate(InputStream inputStream, Metadata metadata) throws
IOException;
+ boolean shouldTranslate(TikaInputStream inputStream, Metadata metadata)
throws IOException;
- InputStream translate(InputStream inputStream,
- Metadata metadata) throws IOException;
+ void translate(TikaInputStream inputStream, Metadata metadata,
OutputStream os) throws IOException;
}
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
index 42544dc80..70c21ffb4 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
@@ -21,6 +21,7 @@ import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
+import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
@@ -110,18 +111,27 @@ public class RUnpackExtractor extends
ParsingEmbeddedDocumentExtractor {
}
}
- private void parseWithBytes(TikaInputStream stream, ContentHandler
handler, Metadata metadata)
- throws TikaException, IOException, SAXException {
- //TODO -- improve the efficiency of this so that we're not
- //literally writing out a file per request
+ private void parseWithBytes(TikaInputStream tis, ContentHandler handler,
Metadata metadata) throws TikaException, IOException, SAXException {
+
Path tmp = Files.createTempFile("tika-tmp-", ".bin");
- if (embeddedStreamTranslator.shouldTranslate(stream, metadata)) {
- Files.copy(embeddedStreamTranslator.translate(stream, metadata),
tmp, StandardCopyOption.REPLACE_EXISTING);
- } else {
- Files.copy(stream, tmp, StandardCopyOption.REPLACE_EXISTING);
- }
- try (TikaInputStream tmpTis = TikaInputStream.get(tmp)) {
- parse(tmpTis, handler, metadata);
+ try {
+ //translate the stream or not
+ if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
+ try (OutputStream os = Files.newOutputStream(tmp)) {
+ embeddedStreamTranslator.translate(tis, metadata, os);
+ }
+ } else {
+ Files.copy(tis, tmp, StandardCopyOption.REPLACE_EXISTING);
+ }
+
+ //now do the parse
+ if (tis.getOpenContainer() != null) {
+ parse(tis, handler, metadata);
+ } else {
+ try (TikaInputStream tisTmp = TikaInputStream.get(tmp)) {
+ parse(tisTmp, handler, metadata);
+ }
+ }
} finally {
try {
storeEmbeddedBytes(tmp, metadata);
@@ -139,6 +149,10 @@ public class RUnpackExtractor extends
ParsingEmbeddedDocumentExtractor {
}
private void storeEmbeddedBytes(Path p, Metadata metadata) {
+ if (p == null) {
+ return;
+ }
+
if (! embeddedBytesSelector.select(metadata)) {
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("skipping embedded bytes {} <-> {}",
diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
index de57eda72..cf250e934 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
@@ -140,12 +140,16 @@ public class FilenameUtils {
public static String getSanitizedEmbeddedFileName(Metadata metadata,
String defaultExtension,
int maxLength) {
- String path = getEmbeddedPath(metadata);
+ String path = getEmbeddedName(metadata);
//fName could be a full path or null
if (StringUtils.isBlank(path)) {
return null;
}
path = path.replaceAll("\u0000", " ");
+ if (path.startsWith("\"") && path.endsWith("\"")) {
+ path = path.substring(1, path.length() - 1);
+ }
+
int prefixLength = getPrefixLength(path);
if (prefixLength > 0) {
path = path.substring(prefixLength);
@@ -173,6 +177,7 @@ public class FilenameUtils {
namePart = namePart.replaceAll("(\\.\\.)+", "_");
namePart = namePart.replaceAll("[/\\\\]+", "_");
namePart = namePart.replaceAll(":+", "_");
+ namePart = namePart.trim();
if (StringUtils.isBlank(namePart)) {
return null;
@@ -286,6 +291,7 @@ public class FilenameUtils {
return path;
}
+ //may return null
private static String getEmbeddedPath(Metadata metadata) {
//potentially look for other values in embedded path or original file
name, etc...
//maybe different fallback order?
@@ -304,6 +310,27 @@ public class FilenameUtils {
return metadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME);
}
+ //this tries for resource name first, and then backs off to path
+ private static String getEmbeddedName(Metadata metadata) {
+ //potentially look for other values in embedded path or original file
name, etc...
+ //maybe different fallback order?
+ String path = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+ if (! StringUtils.isBlank(path)) {
+ return path;
+ }
+ path = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
+ if (! StringUtils.isBlank(path)) {
+ return path;
+ }
+
+ path = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+ if (! StringUtils.isBlank(path)) {
+ return path;
+ }
+
+ return metadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME);
+ }
+
/**
* Calculate the extension based on the {@link Metadata#CONTENT_TYPE}
value.
* On parse exception or null value, return the default value.
diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
index c3abd4134..c670bac83 100644
--- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
@@ -151,6 +151,9 @@ public class FilenameUtilsTest {
assertEquals("brown fox.xlsx", sanitizeFilename("a:/the quick:brown
fox.xlsx"));
assertEquals("_the quick brown fox.xlsx",
sanitizeFilename("C:\\a/b/c/..the quick brown fox.xlsx"));
assertEquals("_the quick brown fox.xlsx",
sanitizeFilename("~/a/b/c/.the quick brown fox.xlsx"));
+ assertEquals("the quick%3Ebrown fox.xlsx", sanitizeFilename("the
quick>brown fox.xlsx"));
+ assertEquals("the quick\"brown fox.xlsx", sanitizeFilename("the
quick\"brown fox.xlsx"));
+ assertEquals("the quick brown fox.xlsx", sanitizeFilename("\"the quick
brown fox.xlsx\""));
assertEquals("_.docx", sanitizeFilename("..................docx"));
assertEquals("_.docx", sanitizeFilename("..docx"));
@@ -168,7 +171,7 @@ public class FilenameUtilsTest {
@Test
public void testEmbeddedFilePaths() throws Exception {
String n = "the quick brown fox.docx";
- /*assertEquals(n, sanitizePath(n));
+ assertEquals(n, sanitizePath(n));
assertEquals(n, sanitizePath(n.substring(0, n.length() - 5),
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
assertEquals(n, sanitizeFilename("the quick\u0000brown fox.docx"));
@@ -204,7 +207,7 @@ public class FilenameUtilsTest {
assertNull(sanitizePath(""));
assertNull(sanitizePath(null));
assertNull(sanitizePath("/"));
- assertNull(sanitizePath("~/"));*/
+ assertNull(sanitizePath("~/"));
assertNull(sanitizePath("C:"));
assertNull(sanitizePath("C:/"));
assertNull(sanitizePath("C:\\"));
@@ -235,6 +238,7 @@ public class FilenameUtilsTest {
private Metadata getMetadata(String name) {
Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, name);
return metadata;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
index 24f7ec2d3..433d34a00 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
@@ -18,9 +18,10 @@ package org.apache.tika.extractor.microsoft;
import java.io.IOException;
import java.io.InputStream;
+import java.io.OutputStream;
import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.commons.io.output.CloseShieldOutputStream;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
@@ -43,26 +44,22 @@ public class MSEmbeddedStreamTranslator implements
EmbeddedStreamTranslator {
private static final Logger LOG =
LoggerFactory.getLogger(MSEmbeddedStreamTranslator.class);
@Override
- public boolean shouldTranslate(InputStream inputStream, Metadata metadata)
throws IOException {
+ public boolean shouldTranslate(TikaInputStream tis, Metadata metadata)
throws IOException {
String contentType =
metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
if
("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType))
{
return true;
- } else if (inputStream instanceof TikaInputStream) {
- TikaInputStream tin = (TikaInputStream) inputStream;
- if (tin.getOpenContainer() != null &&
- tin.getOpenContainer() instanceof DirectoryEntry) {
- return true;
- }
+ } else {
+ return tis.getOpenContainer() != null &&
+ tis.getOpenContainer() instanceof DirectoryEntry;
}
- return false;
}
@Override
- public InputStream translate(InputStream inputStream, Metadata metadata)
throws IOException {
+ public void translate(TikaInputStream tis, Metadata metadata, OutputStream
os) throws IOException {
String contentType =
metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
if
("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType))
{
UnsynchronizedByteArrayOutputStream bos =
UnsynchronizedByteArrayOutputStream.builder().get();
- IOUtils.copy(inputStream, bos);
+ IOUtils.copy(tis, bos);
POIFSFileSystem poifs = new POIFSFileSystem(bos.toInputStream());
OfficeParser.POIFSDocumentType type =
OfficeParser.POIFSDocumentType.detectType(poifs);
String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
@@ -82,21 +79,17 @@ public class MSEmbeddedStreamTranslator implements
EmbeddedStreamTranslator {
name += '.' + type.getExtension();
}
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
- return
UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get();
- } else if (inputStream instanceof TikaInputStream) {
- TikaInputStream tin = (TikaInputStream) inputStream;
-
- if (tin.getOpenContainer() != null &&
- tin.getOpenContainer() instanceof DirectoryEntry) {
- POIFSFileSystem fs = new POIFSFileSystem();
- copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
- try (UnsynchronizedByteArrayOutputStream bos2 =
UnsynchronizedByteArrayOutputStream.builder().get()) {
- fs.writeFilesystem(bos2);
- return bos2.toInputStream();
+ os.write(data);
+ os.flush();
+ } else {
+ if (tis.getOpenContainer() != null &&
+ tis.getOpenContainer() instanceof DirectoryEntry) {
+ try (POIFSFileSystem fs = new POIFSFileSystem()) {
+ copy((DirectoryEntry) tis.getOpenContainer(),
fs.getRoot());
+ fs.writeFilesystem(CloseShieldOutputStream.wrap(os));
}
}
}
- return inputStream;
}
protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
throws IOException {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/PSTEmailStreamTranslator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/PSTEmailStreamTranslator.java
new file mode 100644
index 000000000..055072481
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/PSTEmailStreamTranslator.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor.microsoft;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.extractor.EmbeddedStreamTranslator;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+
+public class PSTEmailStreamTranslator implements EmbeddedStreamTranslator {
+ private static final String MIME_TYPE =
MediaType.application("x-tika-pst-mail-item").toString();
+
+ private static final Logger LOG =
LoggerFactory.getLogger(PSTEmailStreamTranslator.class);
+ private static final AtomicLong EMAIL_ITEMS = new AtomicLong(0);
+ private static final long LOG_EVERY_X_ITEMS = 100;
+
+ @Override
+ public boolean shouldTranslate(TikaInputStream tis, Metadata metadata)
throws IOException {
+ return
MIME_TYPE.equals(metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE))
+ || MIME_TYPE.equals(metadata.get(Metadata.CONTENT_TYPE));
+ }
+
+ @Override
+ public void translate(TikaInputStream tis, Metadata metadata, OutputStream
os) throws IOException {
+ if (!shouldTranslate(tis, metadata)) {
+ return;
+ }
+ if (EMAIL_ITEMS.getAndIncrement() % LOG_EVERY_X_ITEMS == 0) {
+ LOG.warn("Translating pst email objects to .eml or .msg is not yet
supported. "
+ + "Please open a ticket on our JIRA or a pull request on
Github.");
+ }
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
index e59cba80e..509de7d95 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
@@ -12,4 +12,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.tika.extractor.microsoft.MSEmbeddedStreamTranslator
\ No newline at end of file
+org.apache.tika.extractor.microsoft.MSEmbeddedStreamTranslator
+org.apache.tika.extractor.microsoft.PSTEmailStreamTranslator
\ No newline at end of file
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java
index 80ff66984..5dd27e419 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java
@@ -18,26 +18,18 @@ package org.apache.tika.pipes.core.extractor;
import java.io.IOException;
import java.io.InputStream;
-import java.nio.file.Path;
-import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.extractor.EmbeddedDocumentBytesHandler;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeTypes;
-import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig;
import org.apache.tika.utils.StringUtils;
public abstract class AbstractEmbeddedDocumentBytesHandler implements
EmbeddedDocumentBytesHandler {
- private static final MimeTypes MIME_TYPES =
TikaConfig.getDefaultConfig().getMimeRepository();
-
List<Integer> ids = new ArrayList<>();
public String getEmitKey(String containerEmitKey, int embeddedId,
@@ -56,13 +48,10 @@ public abstract class AbstractEmbeddedDocumentBytesHandler
implements EmbeddedDo
emitKey.append("-embed");
emitKey.append("/");
emitKey.append(embeddedIdString).append(embeddedDocumentBytesConfig.getEmbeddedIdPrefix());
- Path p =
Paths.get(metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
- String fName = p.getFileName().toString();
- emitKey.append(fName);
- if (! fName.contains(".")) {
- appendSuffix(emitKey, metadata, embeddedDocumentBytesConfig);
+ String fName =
FilenameUtils.getSanitizedEmbeddedFileName(metadata, ".bin", 100);
+ if (! StringUtils.isBlank(fName)) {
+ emitKey.append(fName);
}
-
return emitKey.toString();
} else if (embeddedDocumentBytesConfig.getKeyBaseStrategy() ==
EmbeddedDocumentBytesConfig.KEY_BASE_STRATEGY.CONTAINER_NAME_NUMBERED) {
@@ -101,25 +90,7 @@ public abstract class AbstractEmbeddedDocumentBytesHandler
implements EmbeddedDo
emitKey.append(suffix);
} else if (embeddedDocumentBytesConfig.getSuffixStrategy()
.equals(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.DETECTED)) {
- emitKey.append(getExtension(metadata));
+ emitKey.append(FilenameUtils.calculateExtension(metadata, ".bin"));
}
}
-
- private String getExtension(Metadata metadata) {
- String mime = metadata.get(Metadata.CONTENT_TYPE);
- try {
- String ext = MIME_TYPES
- .forName(mime)
- .getExtension();
- if (ext == null) {
- return ".bin";
- } else {
- return ext;
- }
- } catch (MimeTypeException e) {
- //swallow
- }
- return ".bin";
-
- }
}
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
index a2e3064d6..df8611d2c 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
@@ -43,7 +43,6 @@ import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.mutable.MutableInt;
@@ -193,12 +192,16 @@ public class UnpackerResource {
.builder()
.get();
- BoundedInputStream bis = new BoundedInputStream(unpackMaxBytes,
tis);
- IOUtils.copy(bis, bos);
- if (bis.hasHitBound()) {
- throw new IOException(new TikaMemoryLimitException(
- "An attachment is longer than " + "'unpackMaxBytes'
(default=100MB, actual=" + unpackMaxBytes + "). " + "If you need to increase
this " +
- "limit, add a header to your request, such as:
unpackMaxBytes: " + "1073741824. There is a hard limit of 2GB."));
+ if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
+ embeddedStreamTranslator.translate(tis, metadata, bos);
+ } else {
+ BoundedInputStream bis = new
BoundedInputStream(unpackMaxBytes, tis);
+ IOUtils.copy(bis, bos);
+ if (bis.hasHitBound()) {
+ throw new IOException(new TikaMemoryLimitException(
+ "An attachment is longer than " +
"'unpackMaxBytes' (default=100MB, actual=" + unpackMaxBytes + "). " + "If you
need to increase this " +
+ "limit, add a header to your request, such
as: unpackMaxBytes: " + "1073741824. There is a hard limit of 2GB."));
+ }
}
byte[] data = bos.toByteArray();
@@ -224,16 +227,6 @@ public class UnpackerResource {
LOG.warn("Unexpected MimeTypeException", e);
}
}
- try (TikaInputStream is = TikaInputStream.get(data)) {
- if (embeddedStreamTranslator.shouldTranslate(is, metadata)) {
- InputStream translated =
embeddedStreamTranslator.translate(UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(),
metadata);
- UnsynchronizedByteArrayOutputStream bos2 =
UnsynchronizedByteArrayOutputStream
- .builder()
- .get();
- IOUtils.copy(translated, bos2);
- data = bos2.toByteArray();
- }
- }
final String finalName = getFinalName(name, zout);