TIKA-2159 -- first step
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ab009aeb Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ab009aeb Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ab009aeb Branch: refs/heads/2.x Commit: ab009aeb7bb9966972d78827136567e90cfae67c Parents: f2661f9 Author: tballison <[email protected]> Authored: Wed Nov 9 22:05:02 2016 -0500 Committer: tballison <[email protected]> Committed: Wed Nov 9 22:05:02 2016 -0500 ---------------------------------------------------------------------- .../tika/extractor/EmbeddedDocumentUtil.java | 168 +++++++++++++++++++ .../tika/parser/jdbc/AbstractDBParser.java | 7 - .../tika/parser/jdbc/JDBCTableReader.java | 52 ++---- .../tika/parser/pdf/AbstractPDF2XHTML.java | 64 ++++--- .../org/apache/tika/parser/pdf/PDF2XHTML.java | 31 ++-- .../org/apache/tika/parser/pdf/PDFParser.java | 7 +- .../parser/apple/AppleSingleFileParser.java | 8 +- .../org/apache/tika/parser/mbox/MboxParser.java | 5 +- .../tika/parser/mbox/OutlookPSTParser.java | 5 +- .../microsoft/AbstractPOIFSExtractor.java | 46 ++--- .../tika/parser/microsoft/HSLFExtractor.java | 42 ++--- .../tika/parser/microsoft/OfficeParser.java | 11 +- .../tika/parser/microsoft/TNEFParser.java | 11 +- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 12 +- .../tika/parser/microsoft/xml/WordMLParser.java | 9 +- .../tika/parser/rtf/RTFEmbObjHandler.java | 61 +------ .../tika/parser/pkg/CompressorParser.java | 7 +- .../apache/tika/parser/pkg/PackageParser.java | 6 +- .../org/apache/tika/parser/pkg/RarParser.java | 13 +- .../tika/parser/xml/FictionBookParser.java | 21 +-- .../tika/parser/mail/MailContentHandler.java | 30 +--- .../tika/parser/mail/RFC822ParserTest.java | 25 ++- 22 files changed, 337 insertions(+), 304 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java new file mode 100644 index 0000000..3ceba90 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.extractor; + + +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.Detector; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MimeType; +import org.apache.tika.mime.MimeTypeException; +import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.utils.ExceptionUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Utility class to handle common issues with embedded documents. + * <p/> + * Use statically if all that is needed is getting the EmbeddedDocumentExtractor. + * Otherwise, instantiate an instance. + * <p/> + * Note: This is not thread safe. Make sure to instantiate one per thread. + */ +public class EmbeddedDocumentUtil implements Serializable { + + private final ParseContext context; + private final EmbeddedDocumentExtractor embeddedDocumentExtractor; + //these are lazily initialized and can be null + private TikaConfig tikaConfig; + private MimeTypes mimeTypes; + private Detector detector; + + public EmbeddedDocumentUtil(ParseContext context) { + this.context = context; + this.embeddedDocumentExtractor = getEmbeddedDocumentExtractor(context); + } + + public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) { + EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class); + if (extractor == null) { + extractor = new ParsingEmbeddedDocumentExtractor(context); + } + return extractor; + } + + + public PasswordProvider getPasswordProvider() { + return context.get(PasswordProvider.class); + } + + public Detector getDetector() { + //be as lazy as possible and cache the detector + if (detector == null) { + detector = context.get(Detector.class); + if (detector == null) { + detector = getTikaConfig().getDetector(); + } + } + return detector; + } + + public MimeTypes getMimeTypes() { + //be as lazy as possible and cache the mimeTypes + if (mimeTypes == null) { + mimeTypes = context.get(MimeTypes.class); + if (mimeTypes == null) { + mimeTypes = getTikaConfig().getMimeRepository(); + } + } + return mimeTypes; + } + + public TikaConfig getTikaConfig() { + //be as lazy as possible and cache the TikaConfig + if (tikaConfig == null) { + tikaConfig = context.get(TikaConfig.class); + if (tikaConfig == null) { + tikaConfig = TikaConfig.getDefaultConfig(); + } + } + return tikaConfig; + } + + public String getExtension(TikaInputStream is, Metadata metadata) { + String mimeString = metadata.get(Metadata.CONTENT_TYPE); + TikaConfig config = getConfig(); + MimeType mimeType = null; + MimeTypes types = config.getMimeRepository(); + boolean detected = false; + if (mimeString != null) { + try { + mimeType = types.forName(mimeString); + } catch (MimeTypeException e) { + //swallow + } + } + if (mimeType == null) { + Detector detector = config.getDetector(); + try { + MediaType mediaType = detector.detect(is, metadata); + mimeType = types.forName(mediaType.toString()); + detected = true; + is.reset(); + } catch (IOException e) { + //swallow + } catch (MimeTypeException e) { + //swallow + } + } + if (mimeType != null) { + if (detected) { + //set or correct the mime type + metadata.set(Metadata.CONTENT_TYPE, mimeType.toString()); + } + return mimeType.getExtension(); + } + return ".bin"; + } + + public TikaConfig getConfig() { + TikaConfig config = context.get(TikaConfig.class); + if (config == null) { + config = TikaConfig.getDefaultConfig(); + } + return config; + } + + public static void recordException(Throwable t, Metadata m) { + String ex = ExceptionUtils.getFilteredStackTrace(t); + m.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ex); + } + + public boolean shouldParseEmbedded(Metadata m) { + return getEmbeddedDocumentExtractor().shouldParseEmbedded(m); + } + + private EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() { + return embeddedDocumentExtractor; + } + + public void parseEmbedded(InputStream inputStream, ContentHandler handler, + Metadata metadata, boolean outputHtml) throws IOException, SAXException { + embeddedDocumentExtractor.parseEmbedded(inputStream, handler, metadata, outputHtml); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java b/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java index bba14a0..d613dc5 100644 --- a/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java +++ b/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java @@ -26,8 +26,6 @@ import java.util.Set; import org.apache.commons.io.IOExceptionWithCause; import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; import org.apache.tika.metadata.Database; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -46,11 +44,6 @@ abstract class AbstractDBParser extends AbstractParser { private Connection connection; - protected static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) { - return context.get(EmbeddedDocumentExtractor.class, - new ParsingEmbeddedDocumentExtractor(context)); - } - @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return null; http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java b/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java index ec2470f..f6691e0 100644 --- a/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java +++ b/tika-parser-modules/tika-parser-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java @@ -21,7 +21,6 @@ import static java.nio.charset.StandardCharsets.UTF_8; import java.io.ByteArrayInputStream; import java.io.IOException; -import java.io.InputStream; import java.sql.Blob; import java.sql.Clob; import java.sql.Connection; @@ -38,14 +37,11 @@ import org.apache.commons.io.IOExceptionWithCause; import org.apache.commons.io.IOUtils; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Database; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaMetadataKeys; -import org.apache.tika.mime.MediaType; -import org.apache.tika.mime.MimeType; -import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; @@ -65,14 +61,11 @@ class JDBCTableReader { int maxClobLength = 1000000; ResultSet results = null; int rows = 0; - private TikaConfig tikaConfig = null; - private Detector detector = null; - private MimeTypes mimeTypes = null; - + private final EmbeddedDocumentUtil embeddedDocumentUtil; public JDBCTableReader(Connection connection, String tableName, ParseContext context) { this.connection = connection; this.tableName = tableName; - this.tikaConfig = context.get(TikaConfig.class); + embeddedDocumentUtil = new EmbeddedDocumentUtil(context); } public boolean nextRow(ContentHandler handler, ParseContext context) throws IOException, SAXException { @@ -204,8 +197,9 @@ class JDBCTableReader { //is there a more efficient way to go from a Reader to an InputStream? String s = clob.getSubString(0, readSize); - EmbeddedDocumentExtractor ex = AbstractDBParser.getEmbeddedDocumentExtractor(context); - ex.parseEmbedded(new ByteArrayInputStream(s.getBytes(UTF_8)), handler, m, true); + if (embeddedDocumentUtil.shouldParseEmbedded(m)) { + embeddedDocumentUtil.parseEmbedded(new ByteArrayInputStream(s.getBytes(UTF_8)), handler, m, true); + } } protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet, int columnIndex, @@ -216,8 +210,7 @@ class JDBCTableReader { m.set(Database.PREFIX + "ROW_NUM", Integer.toString(rowNum)); m.set(Database.PREFIX + "IS_BLOB", "true"); Blob blob = null; - InputStream is = null; - EmbeddedDocumentExtractor ex = AbstractDBParser.getEmbeddedDocumentExtractor(context); + TikaInputStream is = null; try { blob = getBlob(resultSet, columnIndex, m); if (blob == null) { @@ -229,20 +222,14 @@ class JDBCTableReader { ((AttributesImpl) attrs).addAttribute("", "column_name", "column_name", "CDATA", columnName); ((AttributesImpl) attrs).addAttribute("", "row_number", "row_number", "CDATA", Integer.toString(rowNum)); handler.startElement("", "span", "span", attrs); - MediaType mediaType = getDetector().detect(is, new Metadata()); - String extension = ""; - try { - MimeType mimeType = getMimeTypes().forName(mediaType.toString()); - m.set(Metadata.CONTENT_TYPE, mimeType.toString()); - extension = mimeType.getExtension(); - } catch (MimeTypeException e) { - //swallow - } + String extension = embeddedDocumentUtil.getExtension(is, m); + m.set(TikaMetadataKeys.RESOURCE_NAME_KEY, //just in case something screwy is going on with the column name FilenameUtils.normalize(FilenameUtils.getName(columnName + "_" + rowNum + extension))); - - ex.parseEmbedded(is, handler, m, true); + if (embeddedDocumentUtil.shouldParseEmbedded(m)) { + embeddedDocumentUtil.parseEmbedded(is, handler, m, true); + } } finally { if (blob != null) { @@ -315,24 +302,15 @@ class JDBCTableReader { protected TikaConfig getTikaConfig() { - if (tikaConfig == null) { - tikaConfig = TikaConfig.getDefaultConfig(); - } - return tikaConfig; + return embeddedDocumentUtil.getTikaConfig(); } protected Detector getDetector() { - if (detector != null) return detector; - - detector = getTikaConfig().getDetector(); - return detector; + return embeddedDocumentUtil.getDetector(); } protected MimeTypes getMimeTypes() { - if (mimeTypes != null) return mimeTypes; - - mimeTypes = getTikaConfig().getMimeRepository(); - return mimeTypes; + return embeddedDocumentUtil.getMimeTypes(); } } http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index 44e7032..c175138 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -66,7 +66,7 @@ import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.tools.imageio.ImageIOUtil; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -101,7 +101,8 @@ class AbstractPDF2XHTML extends PDFTextStripper { final PDDocument pdDocument; final XHTMLContentHandler xhtml; private final ParseContext context; - private final Metadata metadata; + final Metadata metadata; + final EmbeddedDocumentExtractor embeddedDocumentExtractor; final PDFParserConfig config; private int pageIndex = 0; @@ -113,6 +114,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { this.context = context; this.metadata = metadata; this.config = config; + embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); } @Override @@ -125,15 +127,6 @@ class AbstractPDF2XHTML extends PDFTextStripper { writeParagraphStart(); } - EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() { - EmbeddedDocumentExtractor extractor = - context.get(EmbeddedDocumentExtractor.class); - if (extractor == null) { - extractor = new ParsingEmbeddedDocumentExtractor(context); - } - return extractor; - } - private void extractEmbeddedDocuments(PDDocument document) throws IOException, SAXException, TikaException { PDDocumentNameDictionary namesDictionary = @@ -170,31 +163,28 @@ class AbstractPDF2XHTML extends PDFTextStripper { return; } - EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) { PDComplexFileSpecification spec = ent.getValue(); - extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor); + extractMultiOSPDEmbeddedFiles(ent.getKey(), spec); } } private void extractMultiOSPDEmbeddedFiles(String displayName, - PDComplexFileSpecification spec, - EmbeddedDocumentExtractor extractor) throws IOException, + PDComplexFileSpecification spec) throws IOException, SAXException, TikaException { if (spec == null) { return; } //current strategy is to pull all, not just first non-null - extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile(), extractor); - extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac(), extractor); - extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos(), extractor); - extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor); + extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile()); + extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac()); + extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos()); + extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix()); } private void extractPDEmbeddedFile(String displayName, String unicodeFileName, - String fileName, PDEmbeddedFile file, - EmbeddedDocumentExtractor extractor) + String fileName, PDEmbeddedFile file) throws SAXException, IOException, TikaException { if (file == null) { @@ -205,22 +195,30 @@ class AbstractPDF2XHTML extends PDFTextStripper { fileName = (fileName == null) ? displayName : fileName; // TODO: other metadata? - Metadata metadata = new Metadata(); - metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); - metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); - metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); - metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + Metadata embeddedMetadata = new Metadata(); + embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName); + embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); + embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); + embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); - metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName); + embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName); - if (extractor.shouldParseEmbedded(metadata)) { + if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { TikaInputStream stream = null; try { - stream = TikaInputStream.get(file.createInputStream()); - extractor.parseEmbedded( + + InputStream rawStream = null; + try { + rawStream = file.createInputStream(); + } catch (IOException e) { + EmbeddedDocumentUtil.recordException(e, metadata); + return; + } + stream = TikaInputStream.get(rawStream); + embeddedDocumentExtractor.parseEmbedded( stream, new EmbeddedContentHandler(xhtml), - metadata, false); + embeddedMetadata, false); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); @@ -291,14 +289,13 @@ class AbstractPDF2XHTML extends PDFTextStripper { protected void endPage(PDPage page) throws IOException { try { - EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { - extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor); + extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { @@ -457,6 +454,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { return; } catch (XMLStreamException |IOException e) { //if there was an xml parse exception in xfa, try the AcroForm + EmbeddedDocumentUtil.recordException(e, metadata); } } http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index ac9823e..0ae8137 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -46,7 +46,7 @@ import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.tools.imageio.ImageIOUtil; import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; @@ -188,15 +188,15 @@ class PDF2XHTML extends AbstractPDF2XHTML { PDImageXObject image = (PDImageXObject) object; - Metadata metadata = new Metadata(); + Metadata embeddedMetadata = new Metadata(); String extension = image.getSuffix(); - if (extension == null) { - metadata.set(Metadata.CONTENT_TYPE, "image/png"); + if (extension == null || extension.equals("png")) { + embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/png"); extension = "png"; } else if (extension.equals("jpg")) { - metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); + embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); } else if (extension.equals("tiff")) { - metadata.set(Metadata.CONTENT_TYPE, "image/tiff"); + embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff"); extension = "tif"; } else { //TODO: determine if we need to add more image types @@ -208,7 +208,7 @@ class PDF2XHTML extends AbstractPDF2XHTML { imageNumber = inlineImageCounter++; } String fileName = "image" + imageNumber + "."+extension; - metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); + embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName); // Output the img tag AttributesImpl attr = new AttributesImpl(); @@ -226,20 +226,23 @@ class PDF2XHTML extends AbstractPDF2XHTML { processedInlineImages.put(cosStream, imageNumber); } - metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); - EmbeddedDocumentExtractor extractor = - getEmbeddedDocumentExtractor(); - if (extractor.shouldParseEmbedded(metadata)) { + if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); try { //TODO: handle image.getMetadata()? - writeToBuffer(image, extension, buffer); - extractor.parseEmbedded( + try { + writeToBuffer(image, extension, buffer); + } catch (IOException e) { + EmbeddedDocumentUtil.recordException(e, metadata); + return; + } + embeddedDocumentExtractor.parseEmbedded( new ByteArrayInputStream(buffer.toByteArray()), new EmbeddedContentHandler(xhtml), - metadata, false); + embeddedMetadata, false); } catch (IOException e) { handleCatchableIOE(e); } http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 185af6a..a2e2a74 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -46,6 +46,7 @@ import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.AccessPermissions; import org.apache.tika.metadata.Metadata; @@ -207,7 +208,7 @@ public class PDFParser extends AbstractParser { //now go for the XMP - Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), context); + Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context); XMPMetadata xmp = null; if (dom != null) { @@ -654,7 +655,7 @@ public class PDFParser extends AbstractParser { } //can return null! - private Document loadDOM(PDMetadata pdMetadata, ParseContext context) { + private Document loadDOM(PDMetadata pdMetadata, Metadata parentMetadata, ParseContext context) { if (pdMetadata == null) { return null; } @@ -663,7 +664,7 @@ public class PDFParser extends AbstractParser { documentBuilder.setErrorHandler((ErrorHandler)null); return documentBuilder.parse(is); } catch (IOException|SAXException|TikaException e) { - //swallow + EmbeddedDocumentUtil.recordException(e, parentMetadata); } return null; http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java index 0f3c044..fa41554 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java @@ -29,7 +29,7 @@ import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.EndianUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -79,11 +79,7 @@ public class AppleSingleFileParser extends AbstractParser { Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); - - if (ex == null) { - ex = new ParsingEmbeddedDocumentExtractor(context); - } + EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); short numEntries = readThroughNumEntries(stream); long bytesRead = 26; http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java index 83e26da..a82d74e 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java @@ -37,7 +37,7 @@ import java.util.regex.Pattern; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -81,8 +81,7 @@ public class MboxParser extends AbstractParser { public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException { - EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class, - new ParsingEmbeddedDocumentExtractor(context)); + EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); String charsetName = "windows-1252"; http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java index 5883bd5..dee17db 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java @@ -32,7 +32,7 @@ import com.pff.PSTFolder; import com.pff.PSTMessage; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -69,8 +69,7 @@ public class OutlookPSTParser extends AbstractParser { throws IOException, SAXException, TikaException { // Use the delegate parser to parse the contained document - EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class, - new ParsingEmbeddedDocumentExtractor(context)); + EmbeddedDocumentExtractor embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString()); http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java index a71be5b..725ce8b 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java @@ -33,8 +33,7 @@ import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.detect.DetectorProxy; import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -50,11 +49,8 @@ import org.xml.sax.SAXException; abstract class AbstractPOIFSExtractor { private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class); - private final EmbeddedDocumentExtractor extractor; + private final EmbeddedDocumentUtil embeddedDocumentUtil; private PasswordProvider passwordProvider; - private TikaConfig tikaConfig; - private MimeTypes mimeTypes; - private Detector detector; private Metadata metadata; private final Detector zipDetectorProxy; @@ -63,42 +59,28 @@ abstract class AbstractPOIFSExtractor { } protected AbstractPOIFSExtractor(ParseContext context, Metadata metadata) { - EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); - - if (ex == null) { - this.extractor = new ParsingEmbeddedDocumentExtractor(context); - } else { - this.extractor = ex; - } + embeddedDocumentUtil = new EmbeddedDocumentUtil(context); this.passwordProvider = context.get(PasswordProvider.class); - this.tikaConfig = context.get(TikaConfig.class); - this.mimeTypes = context.get(MimeTypes.class); - this.detector = context.get(Detector.class); this.metadata = metadata; this.zipDetectorProxy = new DetectorProxy("org.apache.tika.parser.pkg.ZipContainerDetector", getClass().getClassLoader()); } // Note - these cache, but avoid creating the default TikaConfig if not needed protected TikaConfig getTikaConfig() { - if (tikaConfig == null) { - tikaConfig = TikaConfig.getDefaultConfig(); - } - return tikaConfig; + return embeddedDocumentUtil.getTikaConfig(); } protected Detector getDetector() { - if (detector != null) return detector; - - detector = getTikaConfig().getDetector(); - return detector; + return embeddedDocumentUtil.getDetector(); } + /** + * @deprecated use {@link #embeddedDocumentUtil} + * @return mimetypes + */ protected MimeTypes getMimeTypes() { - if (mimeTypes != null) return mimeTypes; - - mimeTypes = getTikaConfig().getMimeRepository(); - return mimeTypes; + return embeddedDocumentUtil.getMimeTypes(); } /** @@ -139,8 +121,8 @@ abstract class AbstractPOIFSExtractor { metadata.set(Metadata.CONTENT_TYPE, mediaType); } - if (extractor.shouldParseEmbedded(metadata)) { - extractor.parseEmbedded(resource, xhtml, metadata, outputHtml); + if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) { + embeddedDocumentUtil.parseEmbedded(resource, xhtml, metadata, outputHtml); } } finally { resource.close(); @@ -249,7 +231,7 @@ abstract class AbstractPOIFSExtractor { } // Should we parse it? - if (extractor.shouldParseEmbedded(metadata)) { + if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) { if (embedded == null) { // Make a TikaInputStream that just // passes the root directory of the @@ -258,7 +240,7 @@ abstract class AbstractPOIFSExtractor { embedded = TikaInputStream.get(new byte[0]); embedded.setOpenContainer(dir); } - extractor.parseEmbedded(embedded, xhtml, metadata, true); + embeddedDocumentUtil.parseEmbedded(embedded, xhtml, metadata, true); } } finally { if (embedded != null) { http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java index ce0ede7..64ec813 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java @@ -17,12 +17,12 @@ package org.apache.tika.parser.microsoft; import java.io.IOException; +import java.io.InputStream; import java.util.HashSet; import java.util.List; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.poi.common.usermodel.Hyperlink; -import org.apache.poi.hslf.exceptions.HSLFException; import org.apache.poi.hslf.model.Comment; import org.apache.poi.hslf.model.HeadersFooters; import org.apache.poi.hslf.model.OLEShape; @@ -41,6 +41,7 @@ import org.apache.poi.hslf.usermodel.HSLFTextShape; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TaggedIOException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -51,8 +52,11 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; public class HSLFExtractor extends AbstractPOIFSExtractor { - public HSLFExtractor(ParseContext context) { + private final Metadata metadata; + + public HSLFExtractor(ParseContext context, Metadata metadata) { super(context); + this.metadata = metadata; } protected void parse( @@ -330,17 +334,17 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { break; } - try (TikaInputStream picIs = TikaInputStream.get(pic.getData())){ + byte[] data = null; + try { + data = pic.getData(); + } catch (Exception e) { + EmbeddedDocumentUtil.recordException(e, metadata); + continue; + } + try (TikaInputStream picIs = TikaInputStream.get(data)){ handleEmbeddedResource( picIs, null, null, mediaType, xhtml, false); - } catch (HSLFException e) { - if (e.getMessage() != null && e.getMessage().contains("incorrect data check")) { - //TIKA-2157 - //swallow - } else { - throw e; - } } } } @@ -378,8 +382,14 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { attributes.addAttribute("", "id", "id", "CDATA", objID); xhtml.startElement("div", attributes); xhtml.endElement("div"); - - try (TikaInputStream stream = TikaInputStream.get(data.getData())) { + InputStream dataStream = null; + try { + dataStream = data.getData(); + } catch (Exception e) { + EmbeddedDocumentUtil.recordException(e, metadata); + continue; + } + try (TikaInputStream stream = TikaInputStream.get(dataStream)) { String mediaType = null; if ("Excel.Chart.8".equals(oleShape.getProgID())) { mediaType = "application/vnd.ms-excel"; @@ -397,13 +407,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { mediaType, xhtml, false); } } catch (TaggedIOException e) { - if ("incorrect data check".equals(e.getMessage())) { - //TIKA-2130 - //some embedded objects can't be uncompressed correctly - //swallow - } else { - throw e; - } + EmbeddedDocumentUtil.recordException(e, metadata); } } } http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java index f7f1c4a..5218dfa 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java @@ -43,7 +43,7 @@ import org.apache.poi.util.IOUtils; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -130,11 +130,8 @@ public class OfficeParser extends AbstractParser { parse(root, context, metadata, xhtml); //now try to get macros - EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); - if (ex == null) { - ex = new ParsingEmbeddedDocumentExtractor(context); - } - extractMacros(root.getNFileSystem(), xhtml, ex); + extractMacros(root.getNFileSystem(), xhtml, + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)); } finally { IOUtils.closeQuietly(mustCloseFs); } @@ -169,7 +166,7 @@ public class OfficeParser extends AbstractParser { new WordExtractor(context, metadata).parse(root, xhtml); break; case POWERPOINT: - new HSLFExtractor(context).parse(root, xhtml); + new HSLFExtractor(context, metadata).parse(root, xhtml); break; case WORKBOOK: case XLR: http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java index 879546b..484f0c5 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java @@ -30,7 +30,7 @@ import org.apache.poi.hmef.attribute.MAPIRtfAttribute; import org.apache.poi.hsmf.datatypes.MAPIProperty; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -68,13 +68,8 @@ public class TNEFParser extends AbstractParser { throws IOException, SAXException, TikaException { // We work by recursing, so get the appropriate bits - EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); - EmbeddedDocumentExtractor embeddedExtractor; - if (ex == null) { - embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); - } else { - embeddedExtractor = ex; - } + EmbeddedDocumentExtractor embeddedExtractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); // Ask POI to process the file for us HMEFMessage msg = new HMEFMessage(stream); http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index 1f16a3c..f9ba8a6 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -39,7 +39,7 @@ import org.apache.poi.poifs.filesystem.Ole10NativeException; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -75,15 +75,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) { this.extractor = extractor; - - EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); - - if (ex == null) { - embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); - } else { - embeddedExtractor = ex; - } - + embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); } /** http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java index 8334c67..364d81e 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java @@ -28,7 +28,7 @@ import java.util.concurrent.ConcurrentHashMap; import org.apache.commons.codec.binary.Base64; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -87,16 +87,13 @@ public class WordMLParser extends AbstractXML2003Parser { Metadata metadata, ParseContext context) { EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); - if (ex == null) { - ex = new ParsingEmbeddedDocumentExtractor(context); - } - return new TeeContentHandler( super.getContentHandler(ch, metadata, context), new WordMLHandler(ch), new HyperlinkHandler(ch, WORD_ML_URL), - new PictHandler(ch, ex)); + new PictHandler(ch, + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context))); } @Override http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java index 1334906..dbdc842 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java @@ -23,18 +23,11 @@ import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.RTFMetadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.mime.MimeType; -import org.apache.tika.mime.MimeTypeException; -import org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.EmbeddedContentHandler; import org.xml.sax.ContentHandler; @@ -64,9 +57,7 @@ class RTFEmbObjHandler { private static final String EMPTY_STRING = ""; private final ContentHandler handler; - - - private final ParseContext context; + private final EmbeddedDocumentUtil embeddedDocumentUtil; private final ByteArrayOutputStream os; //high hex cached for writing hexpair chars (data) private int hi = -1; @@ -81,7 +72,7 @@ class RTFEmbObjHandler { private EMB_STATE state = EMB_STATE.NADA; protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context) { this.handler = handler; - this.context = context; + this.embeddedDocumentUtil = new EmbeddedDocumentUtil(context); os = new ByteArrayOutputStream(); } @@ -170,18 +161,14 @@ class RTFEmbObjHandler { * @throws TikaException */ protected void handleCompletedObject() throws IOException, SAXException, TikaException { - EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class); - if (embeddedExtractor == null) { - embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); - } byte[] bytes = os.toByteArray(); if (state == EMB_STATE.OBJDATA) { RTFObjDataParser objParser = new RTFObjDataParser(); try { byte[] objBytes = objParser.parse(bytes, metadata, unknownFilenameCount); - extractObj(objBytes, handler, embeddedExtractor, metadata); + extractObj(objBytes, handler, metadata); } catch (IOException e) { //swallow. If anything goes wrong, ignore. } @@ -192,7 +179,7 @@ class RTFEmbObjHandler { metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath)); } metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject)); - extractObj(bytes, handler, embeddedExtractor, metadata); + extractObj(bytes, handler, metadata); } else if (state == EMB_STATE.NADA) { //swallow...no start for pict or embed?! @@ -200,8 +187,7 @@ class RTFEmbObjHandler { reset(); } - private void extractObj(byte[] bytes, ContentHandler handler, - EmbeddedDocumentExtractor embeddedExtractor, Metadata metadata) + private void extractObj(byte[] bytes, ContentHandler handler, Metadata metadata) throws SAXException, IOException, TikaException { if (bytes == null) { @@ -210,11 +196,10 @@ class RTFEmbObjHandler { metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length)); - if (embeddedExtractor.shouldParseEmbedded(metadata)) { + if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) { TikaInputStream stream = TikaInputStream.get(bytes); if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) { - String extension = getExtension(stream, metadata); - stream.reset(); + String extension = embeddedDocumentUtil.getExtension(stream, metadata); if (inObject && state == EMB_STATE.PICT) { metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_" + thumbCount++ + extension); metadata.set(RTFMetadata.THUMBNAIL, "true"); @@ -224,7 +209,7 @@ class RTFEmbObjHandler { } } try { - embeddedExtractor.parseEmbedded( + embeddedDocumentUtil.parseEmbedded( stream, new EmbeddedContentHandler(handler), metadata, false); @@ -234,34 +219,6 @@ class RTFEmbObjHandler { } } - private String getExtension(TikaInputStream is, Metadata metadata) { - String cType = metadata.get(Metadata.CONTENT_TYPE); - TikaConfig config = getConfig(); - if (cType == null) { - Detector detector = config.getDetector(); - try { - MediaType mediaType = detector.detect(is, metadata); - MimeTypes types = config.getMimeRepository(); - MimeType mime = types.forName(mediaType.toString()); - metadata.set(Metadata.CONTENT_TYPE, mime.toString()); - return mime.getExtension(); - } catch (IOException e) { - //swallow - } catch (MimeTypeException e) { - - } - } - return ".bin"; - } - - private TikaConfig getConfig() { - TikaConfig config = context.get(TikaConfig.class); - if (config == null) { - config = TikaConfig.getDefaultConfig(); - } - return config; - } - /** * reset state after each object. * Do not reset unknown file number. http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java index 84b3b11..c4cd8de 100644 --- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java +++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java @@ -38,7 +38,7 @@ import org.apache.commons.compress.compressors.z.ZCompressorInputStream; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; @@ -157,9 +157,8 @@ public class CompressorParser extends AbstractParser { } // Use the delegate parser to parse the compressed document - EmbeddedDocumentExtractor extractor = context.get( - EmbeddedDocumentExtractor.class, - new ParsingEmbeddedDocumentExtractor(context)); + EmbeddedDocumentExtractor extractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); if (extractor.shouldParseEmbedded(entrydata)) { extractor.parseEmbedded(cis, xhtml, entrydata, true); } http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java index 443eb9e..370efe6 100644 --- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java +++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java @@ -43,7 +43,7 @@ import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -168,9 +168,7 @@ public class PackageParser extends AbstractParser { metadata.set(CONTENT_TYPE, type.toString()); } // Use the delegate parser to parse the contained document - EmbeddedDocumentExtractor extractor = context.get( - EmbeddedDocumentExtractor.class, - new ParsingEmbeddedDocumentExtractor(context)); + EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java index 99508b0..cf80e47 100644 --- a/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java +++ b/tika-parser-modules/tika-parser-package-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java @@ -21,10 +21,13 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; +import com.github.junrar.Archive; +import com.github.junrar.exception.RarException; +import com.github.junrar.rarfile.FileHeader; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -35,10 +38,6 @@ import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import com.github.junrar.Archive; -import com.github.junrar.exception.RarException; -import com.github.junrar.rarfile.FileHeader; - /** * Parser for Rar files. */ @@ -61,9 +60,7 @@ public class RarParser extends AbstractParser { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); - EmbeddedDocumentExtractor extractor = context.get( - EmbeddedDocumentExtractor.class, - new ParsingEmbeddedDocumentExtractor(context)); + EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); Archive rar = null; try (TemporaryResources tmp = new TemporaryResources()) { http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java index e79bbfc..bf06a08 100644 --- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java +++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java @@ -16,9 +16,14 @@ */ package org.apache.tika.parser.xml; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.util.Collections; +import java.util.Set; + import org.apache.commons.codec.binary.Base64; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaMetadataKeys; import org.apache.tika.mime.MediaType; @@ -28,11 +33,6 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.util.Collections; -import java.util.Set; - public class FictionBookParser extends XMLParser { private static final long serialVersionUID = 4195954546491524374L; @@ -43,13 +43,8 @@ public class FictionBookParser extends XMLParser { @Override protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) { - EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); - - if (ex == null) { - ex = new ParsingEmbeddedDocumentExtractor(context); - } - - return new BinaryElementsDataHandler(ex, handler); + return new BinaryElementsDataHandler( + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler); } private static class BinaryElementsDataHandler extends DefaultHandler { http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java index 458ed01..00cc6d8 100644 --- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java +++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java @@ -47,16 +47,12 @@ import org.apache.james.mime4j.field.LenientFieldParser; import org.apache.james.mime4j.parser.ContentHandler; import org.apache.james.mime4j.stream.BodyDescriptor; import org.apache.james.mime4j.stream.Field; -import org.apache.tika.config.TikaConfig; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; @@ -141,29 +137,7 @@ class MailContentHandler implements ContentHandler { // to handle/process the parts/attachments // Was an EmbeddedDocumentExtractor explicitly supplied? - this.extractor = context.get(EmbeddedDocumentExtractor.class); - - // If there's no EmbeddedDocumentExtractor, then try using a normal parser - // This will ensure that the contents are made available to the user, so - // the see the text, but without fine-grained control/extraction - // (This also maintains backward compatibility with older versions!) - if (this.extractor == null) { - // If the user gave a parser, use that, if not the default - Parser parser = context.get(AutoDetectParser.class); - if (parser == null) { - parser = context.get(Parser.class); - } - if (parser == null) { - TikaConfig tikaConfig = context.get(TikaConfig.class); - if (tikaConfig == null) { - tikaConfig = TikaConfig.getDefaultConfig(); - } - parser = new AutoDetectParser(tikaConfig.getParser()); - } - ParseContext ctx = new ParseContext(); - ctx.set(Parser.class, parser); - extractor = new ParsingEmbeddedDocumentExtractor(ctx); - } + this.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); } public void body(BodyDescriptor body, InputStream is) throws MimeException, http://git-wip-us.apache.org/repos/asf/tika/blob/ab009aeb/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java index 4a506be..877f40f 100644 --- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java +++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java @@ -77,9 +77,11 @@ public class RFC822ParserTest extends TikaTest { Metadata metadata = new Metadata(); InputStream stream = getStream("test-documents/testRFC822"); ContentHandler handler = mock(DefaultHandler.class); + ParseContext context = new ParseContext(); + context.set(Parser.class, new AutoDetectParser()); try { - parser.parse(stream, handler, metadata, new ParseContext()); + parser.parse(stream, handler, metadata, context); verify(handler).startDocument(); //just one body verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class)); @@ -105,9 +107,10 @@ public class RFC822ParserTest extends TikaTest { Metadata metadata = new Metadata(); InputStream stream = getStream("test-documents/testRFC822-multipart"); ContentHandler handler = mock(XHTMLContentHandler.class); - + ParseContext context = new ParseContext(); + context.set(Parser.class, new AutoDetectParser()); try { - parser.parse(stream, handler, metadata, new ParseContext()); + parser.parse(stream, handler, metadata, context); verify(handler).startDocument(); int bodyExpectedTimes = 4, multipackExpectedTimes = 5; verify(handler, times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class)); @@ -124,7 +127,7 @@ public class RFC822ParserTest extends TikaTest { stream = getStream("test-documents/testRFC822-multipart"); handler = new BodyContentHandler(); try { - parser.parse(stream, handler, metadata, new ParseContext()); + parser.parse(stream, handler, metadata, context); //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode String bodyText = handler.toString(); assertTrue(bodyText.contains("body 1")); @@ -141,9 +144,11 @@ public class RFC822ParserTest extends TikaTest { Metadata metadata = new Metadata(); InputStream stream = getStream("test-documents/testRFC822_quoted"); ContentHandler handler = new BodyContentHandler(); + ParseContext context = new ParseContext(); + context.set(Parser.class, new AutoDetectParser()); try { - parser.parse(stream, handler, metadata, new ParseContext()); + parser.parse(stream, handler, metadata, context); //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode String bodyText = handler.toString(); assertTrue(bodyText.contains("D\u00FCsseldorf has non-ascii.")); @@ -161,9 +166,11 @@ public class RFC822ParserTest extends TikaTest { Metadata metadata = new Metadata(); InputStream stream = getStream("test-documents/testRFC822_base64"); ContentHandler handler = new BodyContentHandler(); + ParseContext context = new ParseContext(); + context.set(Parser.class, new AutoDetectParser()); try { - parser.parse(stream, handler, metadata, new ParseContext()); + parser.parse(stream, handler, metadata, context); //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode assertContains("Here is some text, with international characters, voil\u00E0!", handler.toString()); } catch (Exception e) { @@ -256,8 +263,10 @@ public class RFC822ParserTest extends TikaTest { Metadata metadata = new Metadata(); InputStream stream = getStream("test-documents/testRFC822-limitedheaders"); ContentHandler handler = new BodyContentHandler(); + ParseContext context = new ParseContext(); + context.set(Parser.class, new AutoDetectParser()); - parser.parse(stream, handler, metadata, new ParseContext()); + parser.parse(stream, handler, metadata, context); assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR)); assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]); assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]); @@ -282,6 +291,7 @@ public class RFC822ParserTest extends TikaTest { Parser parser = new RFC822Parser(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); + context.set(Parser.class, new AutoDetectParser()); InputStream stream = getStream("test-documents/testRFC822_encrypted_zip"); ContentHandler handler = new BodyContentHandler(); parser.parse(stream, handler, metadata, context); @@ -337,6 +347,7 @@ public class RFC822ParserTest extends TikaTest { Parser parser = new RFC822Parser(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); + context.set(Parser.class, new AutoDetectParser()); InputStream stream = getStream("test-documents/testRFC822_normal_zip"); ContentHandler handler = new BodyContentHandler(); parser.parse(stream, handler, metadata, context);
