This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push: new 5f05b51 TIKA-2644 - refactor recursiveparserwrapper api 5f05b51 is described below commit 5f05b511d7d1184f6f25a2b644b615c4f21b8e68 Author: tballison <talli...@mitre.org> AuthorDate: Thu May 17 14:09:36 2018 -0400 TIKA-2644 - refactor recursiveparserwrapper api --- CHANGES.txt | 3 + .../src/main/java/org/apache/tika/cli/TikaCLI.java | 8 +- .../src/main/java/org/apache/tika/gui/TikaGUI.java | 14 +- .../tika/cli/TikaCLIBatchIntegrationTest.java | 3 +- .../apache/tika/batch/fs/BasicTikaFSConsumer.java | 52 +++-- .../batch/fs/RecursiveParserWrapperFSConsumer.java | 42 ++-- .../fs/builders/BasicTikaFSConsumersBuilder.java | 7 +- .../RecursiveParserWrapperFSConsumerTest.java | 19 +- .../apache/tika/parser/RecursiveParserWrapper.java | 257 +++++++++++++-------- .../sax/AbstractRecursiveParserWrapperHandler.java | 119 ++++++++++ .../tika/sax/BasicContentHandlerFactory.java | 67 +++--- .../org/apache/tika/sax/ContentHandlerFactory.java | 6 + .../tika/sax/RecursiveParserWrapperHandler.java | 120 ++++++++++ .../org/apache/tika/MultiThreadedTikaTest.java | 36 +-- .../src/test/java/org/apache/tika/TikaTest.java | 32 ++- .../org/apache/tika/eval/AbstractProfiler.java | 13 +- .../java/org/apache/tika/eval/ExtractComparer.java | 11 +- .../java/org/apache/tika/eval/ExtractProfiler.java | 3 +- .../org/apache/tika/eval/io/ExtractReader.java | 7 +- .../org/apache/tika/eval/SimpleComparerTest.java | 15 +- .../org/apache/tika/eval/io/ExtractReaderTest.java | 25 +- .../org/apache/tika/example/ParsingExample.java | 10 +- .../multiple/PickBestTextEncodingParser.java | 5 + .../tika/parser/RecursiveParserWrapperTest.java | 121 +++++++--- .../apache/tika/parser/html/HtmlParserTest.java | 13 +- .../apache/tika/parser/jdbc/SQLite3ParserTest.java | 24 +- .../apache/tika/parser/mail/RFC822ParserTest.java | 27 +-- .../apache/tika/parser/mbox/MboxParserTest.java | 5 +- .../tika/parser/microsoft/JackcessParserTest.java | 13 +- 
.../tika/parser/ocr/TesseractOCRParserTest.java | 3 +- .../org/apache/tika/parser/pdf/PDFParserTest.java | 21 +- .../tika/parser/pkg/CompressorParserTest.java | 3 +- .../org/apache/tika/parser/pkg/ZipParserTest.java | 3 +- .../org/apache/tika/parser/rtf/RTFParserTest.java | 12 +- .../serialization/PrettyMetadataKeyComparator.java | 2 +- .../server/resource/RecursiveMetadataResource.java | 25 +- .../tika/server/RecursiveMetadataResourceTest.java | 23 +- 37 files changed, 829 insertions(+), 340 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index b24df29..38f1973 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -5,6 +5,9 @@ Release 2.0.0 - ??? Other changes + * Add the RecursiveParserWrapperHandler to improve the RecursiveParserWrapper + API slightly (TIKA-2644). + * Upgrade jackson to 2.9.5 (TIKA-2634). * Add support for brotli (TIKA-2621). diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 2af7e04..399152d 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -102,6 +102,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.ExpandedTitleContentHandler; +import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.xmp.XMPMetadata; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -498,14 +499,15 @@ public class TikaCLI { private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException { Metadata metadata = new Metadata(); - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, getContentHandlerFactory(type)); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1); 
try (InputStream input = TikaInputStream.get(url, metadata)) { - wrapper.parse(input, null, metadata, context); + wrapper.parse(input, handler, metadata, context); } JsonMetadataList.setPrettyPrinting(prettyPrint); Writer writer = getOutputWriter(output, encoding); try { - JsonMetadataList.toJson(wrapper.getMetadata(), writer); + JsonMetadataList.toJson(handler.getMetadataList(), writer); } finally { writer.flush(); } diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java index bfec921..3f40dd3 100644 --- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java +++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java @@ -81,6 +81,7 @@ import org.apache.tika.parser.utils.CommonsDigester; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; +import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.Attributes; @@ -396,13 +397,16 @@ public class TikaGUI extends JFrame ); } if (isReset) { - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, - new BasicContentHandlerFactory( - BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1)); - wrapper.parse(input, null, new Metadata(), new ParseContext()); + RecursiveParserWrapperHandler recursiveParserWrapperHandler = + new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1), + -1); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); + wrapper.parse(input, recursiveParserWrapperHandler, new Metadata(), new ParseContext()); StringWriter jsonBuffer = new StringWriter(); JsonMetadataList.setPrettyPrinting(true); - JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer); + JsonMetadataList.toJson(recursiveParserWrapperHandler.getMetadataList(), 
jsonBuffer); setText(json, jsonBuffer.toString()); } layout.show(cards, "metadata"); diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java index 1da91db..60c6d6b 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java @@ -35,6 +35,7 @@ import org.apache.commons.io.FileUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.serialization.JsonMetadataList; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -108,7 +109,7 @@ public class TikaCLIBatchIntegrationTest { try (Reader reader = Files.newBufferedReader(jsonFile, UTF_8)) { List<Metadata> metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); - assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human events")); + assertTrue(metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).contains("human events")); } } diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java index 087a482..5ccab17 100644 --- a/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java +++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java @@ -22,6 +22,8 @@ import static java.nio.charset.StandardCharsets.UTF_8; import java.io.InputStream; import java.io.OutputStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.concurrent.ArrayBlockingQueue; import org.apache.commons.io.IOUtils; @@ -45,29 +47,45 @@ import 
org.xml.sax.ContentHandler; public class BasicTikaFSConsumer extends AbstractFSConsumer { private boolean parseRecursively = true; - private final ParserFactory parserFactory; + private final Parser parser; private final ContentHandlerFactory contentHandlerFactory; private final OutputStreamFactory fsOSFactory; - private final TikaConfig config; - private String outputEncoding = UTF_8.toString(); - + private Charset outputEncoding = StandardCharsets.UTF_8; + + /** + * @param queue + * @param parserFactory + * @param contentHandlerFactory + * @param fsOSFactory + * @param tikaConfig + * + * @deprecated use {@link BasicTikaFSConsumer#BasicTikaFSConsumer(ArrayBlockingQueue, Parser, ContentHandlerFactory, OutputStreamFactory)} + */ + @Deprecated public BasicTikaFSConsumer(ArrayBlockingQueue<FileResource> queue, ParserFactory parserFactory, ContentHandlerFactory contentHandlerFactory, - OutputStreamFactory fsOSFactory, - TikaConfig config) { + OutputStreamFactory fsOSFactory, TikaConfig tikaConfig) { + super(queue); + this.parser = parserFactory.getParser(tikaConfig); + this.contentHandlerFactory = contentHandlerFactory; + this.fsOSFactory = fsOSFactory; + } + + public BasicTikaFSConsumer(ArrayBlockingQueue<FileResource> queue, + Parser parser, + ContentHandlerFactory contentHandlerFactory, + OutputStreamFactory fsOSFactory) { super(queue); - this.parserFactory = parserFactory; + this.parser = parser; this.contentHandlerFactory = contentHandlerFactory; this.fsOSFactory = fsOSFactory; - this.config = config; } @Override public boolean processFileResource(FileResource fileResource) { - Parser parser = parserFactory.getParser(config); ParseContext context = new ParseContext(); if (parseRecursively) { context.set(Parser.class, parser); @@ -87,14 +105,8 @@ public class BasicTikaFSConsumer extends AbstractFSConsumer { return false; } ContentHandler handler; - try { - handler = contentHandlerFactory.getNewContentHandler(os, getOutputEncoding()); - } catch 
(UnsupportedEncodingException e) { - incrementHandledExceptions(); - LOG.error(getXMLifiedLogMsg("output_encoding_ex", fileResource.getResourceId(), e)); - flushAndClose(os); - throw new RuntimeException(e); - } + handler = contentHandlerFactory.getNewContentHandler(os, getOutputEncoding()); + //now actually call parse! Throwable thrown = null; @@ -115,11 +127,11 @@ public class BasicTikaFSConsumer extends AbstractFSConsumer { return true; } - public String getOutputEncoding() { + public Charset getOutputEncoding() { return outputEncoding; } - public void setOutputEncoding(String outputEncoding) { - this.outputEncoding = outputEncoding; + public void setOutputEncoding(Charset charset) { + this.outputEncoding = charset; } } diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java index f95bcbb..259157f 100644 --- a/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java +++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java @@ -37,6 +37,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.sax.ContentHandlerFactory; +import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.utils.ExceptionUtils; import org.xml.sax.helpers.DefaultHandler; @@ -49,36 +50,46 @@ import org.xml.sax.helpers.DefaultHandler; */ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer { - - private final ParserFactory parserFactory; + private final Parser parser; private final ContentHandlerFactory contentHandlerFactory; private final OutputStreamFactory fsOSFactory; - private final TikaConfig tikaConfig; private String outputEncoding = "UTF-8"; + /** + * @deprecated use {@link 
RecursiveParserWrapperFSConsumer#RecursiveParserWrapperFSConsumer(ArrayBlockingQueue, Parser, ContentHandlerFactory, OutputStreamFactory)} + * @param queue + * @param parserFactory + * @param contentHandlerFactory + * @param fsOSFactory + * @param config + */ public RecursiveParserWrapperFSConsumer(ArrayBlockingQueue<FileResource> queue, ParserFactory parserFactory, ContentHandlerFactory contentHandlerFactory, - OutputStreamFactory fsOSFactory, TikaConfig tikaConfig) { + OutputStreamFactory fsOSFactory, TikaConfig config) { super(queue); - this.parserFactory = parserFactory; this.contentHandlerFactory = contentHandlerFactory; this.fsOSFactory = fsOSFactory; - this.tikaConfig = tikaConfig; + Parser parserToWrap = parserFactory.getParser(config); + this.parser = new RecursiveParserWrapper(parserToWrap, contentHandlerFactory); + } + + public RecursiveParserWrapperFSConsumer(ArrayBlockingQueue<FileResource> queue, + Parser parserToWrap, + ContentHandlerFactory contentHandlerFactory, + OutputStreamFactory fsOSFactory) { + super(queue); + this.contentHandlerFactory = contentHandlerFactory; + this.fsOSFactory = fsOSFactory; + this.parser = new RecursiveParserWrapper(parserToWrap, contentHandlerFactory); } @Override public boolean processFileResource(FileResource fileResource) { - Parser wrapped = parserFactory.getParser(tikaConfig); - RecursiveParserWrapper parser = new RecursiveParserWrapper(wrapped, contentHandlerFactory); ParseContext context = new ParseContext(); -// if (parseRecursively == true) { - context.set(Parser.class, parser); -// } - //try to open outputstream first OutputStream os = getOutputStream(fsOSFactory, fileResource); @@ -100,13 +111,14 @@ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer { Throwable thrown = null; List<Metadata> metadataList = null; Metadata containerMetadata = fileResource.getMetadata(); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory, -1); try { - 
parse(fileResource.getResourceId(), parser, is, new DefaultHandler(), + parse(fileResource.getResourceId(), parser, is, handler, containerMetadata, context); - metadataList = parser.getMetadata(); + metadataList = handler.getMetadataList(); } catch (Throwable t) { thrown = t; - metadataList = parser.getMetadata(); + metadataList = handler.getMetadataList(); if (metadataList == null) { metadataList = new LinkedList<>(); } diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java index 4879af4..d55f3be 100644 --- a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java +++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java @@ -41,6 +41,7 @@ import org.apache.tika.batch.fs.FSOutputStreamFactory; import org.apache.tika.batch.fs.FSUtil; import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer; import org.apache.tika.config.TikaConfig; +import org.apache.tika.parser.Parser; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.util.ClassLoaderUtil; @@ -129,17 +130,17 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder { OutputStreamFactory outputStreamFactory = getOutputStreamFactory( outputStreamFactoryNode, runtimeAttributes, contentHandlerFactory, recursiveParserWrapper); - + Parser parser = parserFactory.getParser(config); if (recursiveParserWrapper) { for (int i = 0; i < numConsumers; i++) { FileResourceConsumer c = new RecursiveParserWrapperFSConsumer(queue, - parserFactory, contentHandlerFactory, outputStreamFactory, config); + parser, contentHandlerFactory, outputStreamFactory); consumers.add(c); } } else { for (int i = 0; i < numConsumers; i++) { FileResourceConsumer c = new BasicTikaFSConsumer(queue, - parserFactory, contentHandlerFactory, 
outputStreamFactory, config); + parser, contentHandlerFactory, outputStreamFactory); consumers.add(c); } } diff --git a/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java b/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java index ee5b955..de416e8 100644 --- a/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java +++ b/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java @@ -35,7 +35,10 @@ import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.serialization.JsonMetadataList; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.BasicContentHandlerFactory; import org.junit.Test; @@ -69,9 +72,10 @@ public class RecursiveParserWrapperFSConsumerTest extends TikaTest { queue.add(new PoisonFileResource()); MockOSFactory mockOSFactory = new MockOSFactory(); + Parser p = new AutoDetectParserFactory().getParser(new TikaConfig()); RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer( - queue, new AutoDetectParserFactory(), new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), - mockOSFactory, new TikaConfig()); + queue, p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), + mockOSFactory); IFileProcessorFutureResult result = consumer.call(); mockOSFactory.getStreams().get(0).flush(); @@ -80,12 +84,12 @@ public class RecursiveParserWrapperFSConsumerTest extends TikaTest { assertEquals(4, results.size()); assertContains("another null pointer", - results.get(2).get(RecursiveParserWrapper.EMBEDDED_EXCEPTION)); + 
results.get(2).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION)); assertEquals("Nikolai Lobachevsky", results.get(0).get("author")); for (int i = 1; i < 4; i++) { assertEquals("embeddedAuthor"+i, results.get(i).get("author")); - assertContains("some_embedded_content"+i, results.get(i).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("some_embedded_content"+i, results.get(i).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } } @@ -116,9 +120,10 @@ public class RecursiveParserWrapperFSConsumerTest extends TikaTest { queue.add(new PoisonFileResource()); MockOSFactory mockOSFactory = new MockOSFactory(); + Parser p = new AutoDetectParserFactory().getParser(new TikaConfig()); RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer( - queue, new AutoDetectParserFactory(), new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), - mockOSFactory, new TikaConfig()); + queue, p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), + mockOSFactory); IFileProcessorFutureResult result = consumer.call(); mockOSFactory.getStreams().get(0).flush(); @@ -129,7 +134,7 @@ public class RecursiveParserWrapperFSConsumerTest extends TikaTest { results.get(0).get(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime")); assertEquals("Nikolai Lobachevsky", results.get(0).get("author")); assertEquals("embeddedAuthor", results.get(1).get("author")); - assertContains("some_embedded_content", results.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("some_embedded_content", results.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java index ad0ff3f..c1fb815 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java +++ 
b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java @@ -17,24 +17,23 @@ package org.apache.tika.parser; * limitations under the License. */ -import java.io.IOException; -import java.io.InputStream; -import java.util.Date; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.FilenameUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.ContentHandlerFactory; +import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.utils.ParserUtils; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.Set; /** * This is a helper class that wraps a parser in a recursive handler. @@ -76,38 +75,93 @@ public class RecursiveParserWrapper extends ParserDecorator { */ private static final long serialVersionUID = 9086536568120690938L; - //move this to TikaCoreProperties? 
- public final static Property TIKA_CONTENT = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content"); - public final static Property PARSE_TIME_MILLIS = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + "parse_time_millis"); + /** + * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#TIKA_CONTENT} + */ + @Deprecated + public final static Property TIKA_CONTENT = AbstractRecursiveParserWrapperHandler.TIKA_CONTENT; + /** + * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#PARSE_TIME_MILLIS} + */ + @Deprecated + public final static Property PARSE_TIME_MILLIS = AbstractRecursiveParserWrapperHandler.PARSE_TIME_MILLIS; + + /** + * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#WRITE_LIMIT_REACHED} + */ + @Deprecated public final static Property WRITE_LIMIT_REACHED = - Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "write_limit_reached"); - public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = - Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached"); + AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED; + /** + * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_RESOURCE_LIMIT_REACHED} + */ + @Deprecated + public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = + AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED; - public final static Property EMBEDDED_EXCEPTION = ParserUtils.EMBEDDED_EXCEPTION; - //move this to TikaCoreProperties? 
- public final static Property EMBEDDED_RESOURCE_PATH = - Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path"); - - private final Parser wrappedParser; - private final ContentHandlerFactory contentHandlerFactory; - private final List<Metadata> metadatas = new LinkedList<>(); + /** + * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_EXCEPTION} + */ + @Deprecated + public final static Property EMBEDDED_EXCEPTION = AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION; + + /** + * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_RESOURCE_PATH} + */ + @Deprecated + public final static Property EMBEDDED_RESOURCE_PATH = AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH; + + /** + * @deprecated this should be passed in via the {@link RecursiveParserWrapperHandler} + */ + @Deprecated + private ContentHandlerFactory contentHandlerFactory = null; private final boolean catchEmbeddedExceptions; - //used in naming embedded resources that don't have a name. - private int unknownCount = 0; + /** + * set this on the RecursiveParserWrapperHandler instead + * @deprecated this is here only for legacy behavior; it will be removed in 2.0 and/or 1.20 + */ + @Deprecated private int maxEmbeddedResources = -1; - private boolean hitMaxEmbeddedResources = false; + /** + * @deprecated this is here only for legacy behavior; it will be removed in 2.0 and/or 1.20 + */ + @Deprecated + private ParserState lastParseState = null; /** * Initialize the wrapper with {@link #catchEmbeddedExceptions} set * to <code>true</code> as default. * * @param wrappedParser parser to use for the container documents and the embedded documents + */ + public RecursiveParserWrapper(Parser wrappedParser) { + this(wrappedParser, true); + } + + /** + * + * @param wrappedParser parser to wrap + * @param catchEmbeddedExceptions whether or not to catch+record embedded exceptions. 
+ * If set to <code>false</code>, embedded exceptions will be thrown and + * the rest of the file will not be parsed + */ + public RecursiveParserWrapper(Parser wrappedParser, boolean catchEmbeddedExceptions) { + super(wrappedParser); + this.catchEmbeddedExceptions = catchEmbeddedExceptions; + } + /** + * Initialize the wrapper with {@link #catchEmbeddedExceptions} set + * to <code>true</code> as default. + * + * @param wrappedParser parser to use for the container documents and the embedded documents * @param contentHandlerFactory factory to use to generate a new content handler for * the container document and each embedded document + * @deprecated use {@link RecursiveParserWrapper#RecursiveParserWrapper(Parser)} */ + @Deprecated public RecursiveParserWrapper(Parser wrappedParser, ContentHandlerFactory contentHandlerFactory) { this(wrappedParser, contentHandlerFactory, true); } @@ -121,18 +175,19 @@ public class RecursiveParserWrapper extends ParserDecorator { * @param catchEmbeddedExceptions whether or not to catch the embedded exceptions. * If set to <code>true</code>, the stack traces will be stored in * the metadata object with key: {@link #EMBEDDED_EXCEPTION}. + * @deprecated use {@link RecursiveParserWrapper#RecursiveParserWrapper(Parser, boolean)} */ + @Deprecated public RecursiveParserWrapper(Parser wrappedParser, ContentHandlerFactory contentHandlerFactory, boolean catchEmbeddedExceptions) { super(wrappedParser); - this.wrappedParser = wrappedParser; this.contentHandlerFactory = contentHandlerFactory; this.catchEmbeddedExceptions = catchEmbeddedExceptions; } @Override public Set<MediaType> getSupportedTypes(ParseContext context) { - return wrappedParser.getSupportedTypes(context); + return getWrappedParser().getSupportedTypes(context); } /** @@ -145,31 +200,34 @@ public class RecursiveParserWrapper extends ParserDecorator { * Make sure to call {@link #reset()} after each parse. 
*/ @Override - public void parse(InputStream stream, ContentHandler ignore, + public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - - EmbeddedParserDecorator decorator = new EmbeddedParserDecorator("/"); + //this tracks the state of the parent parser, per call to #parse + //in future versions, we can remove lastParseState, and this will be thread-safe + ParserState parserState; + if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) { + parserState = new ParserState((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler); + } else { + parserState = new ParserState(new RecursiveParserWrapperHandler(contentHandlerFactory, maxEmbeddedResources)); + lastParseState = parserState; + } + EmbeddedParserDecorator decorator = new EmbeddedParserDecorator(getWrappedParser(), "/", parserState); context.set(Parser.class, decorator); - ContentHandler localHandler = contentHandlerFactory.getNewContentHandler(); - long started = new Date().getTime(); + ContentHandler localHandler = parserState.recursiveParserWrapperHandler.getNewContentHandler(); + long started = System.currentTimeMillis(); try { - wrappedParser.parse(stream, localHandler, metadata, context); + getWrappedParser().parse(stream, localHandler, metadata, context); } catch (SAXException e) { boolean wlr = isWriteLimitReached(e); if (wlr == false) { throw e; } - metadata.set(WRITE_LIMIT_REACHED, "true"); + metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, "true"); } finally { - long elapsedMillis = new Date().getTime() - started; - metadata.set(PARSE_TIME_MILLIS, Long.toString(elapsedMillis)); - addContent(localHandler, metadata); - - if (hitMaxEmbeddedResources) { - metadata.set(EMBEDDED_RESOURCE_LIMIT_REACHED, "true"); - } - metadatas.add(0, ParserUtils.cloneMetadata(metadata)); + long elapsedMillis = System.currentTimeMillis() - started; + 
metadata.set(RecursiveParserWrapperHandler.PARSE_TIME_MILLIS, Long.toString(elapsedMillis)); + parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata); } } @@ -178,11 +236,20 @@ public class RecursiveParserWrapper extends ParserDecorator { * The first element in the returned list represents the * data from the outer container file. There is no guarantee * about the ordering of the list after that. - * + * + * @deprecated use a {@link RecursiveParserWrapperHandler} instead + * * @return list of Metadata objects that were gathered during the parse + * @throws IllegalStateException if you've used a {@link RecursiveParserWrapperHandler} in your last + * call to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} */ + @Deprecated public List<Metadata> getMetadata() { - return metadatas; + if (lastParseState != null) { + return ((RecursiveParserWrapperHandler) lastParseState.recursiveParserWrapperHandler).getMetadataList(); + } else { + throw new IllegalStateException("This is deprecated; please use a RecursiveParserWrapperHandler instead"); + } } /** @@ -192,22 +259,29 @@ public class RecursiveParserWrapper extends ParserDecorator { * * <p> * If this value is < 0 (the default), the wrapper will store all Metadata. 
- * + * @deprecated set this on a {@link RecursiveParserWrapperHandler} * @param max maximum number of embedded resources to store */ + @Deprecated public void setMaxEmbeddedResources(int max) { maxEmbeddedResources = max; } /** - * This clears the metadata list and resets {@link #unknownCount} and - * {@link #hitMaxEmbeddedResources} + * This clears the last parser state (metadata list, unknown count, hit embeddedresource count) + * + * @deprecated use a {@link org.apache.tika.sax.RecursiveParserWrapperHandler} instead + * @throws IllegalStateException if you used a {@link RecursiveParserWrapperHandler} in your call + * to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} */ + @Deprecated public void reset() { - metadatas.clear(); - unknownCount = 0; - hitMaxEmbeddedResources = false; + if (lastParseState != null) { + lastParseState = new ParserState(new RecursiveParserWrapperHandler(contentHandlerFactory, maxEmbeddedResources)); + } else { + throw new IllegalStateException("This is deprecated; please use a RecursiveParserWrapperHandler instead"); + } } /** @@ -225,35 +299,20 @@ public class RecursiveParserWrapper extends ParserDecorator { return t.getCause() != null && isWriteLimitReached(t.getCause()); } } - - private String getResourceName(Metadata metadata) { + + private String getResourceName(Metadata metadata, ParserState state) { String objectName = ""; if (metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY) != null) { objectName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); - } else if (metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID) != null) { + } else if (metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID) != null) { objectName = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID); - } else { - objectName = "embedded-" + (++unknownCount); - } - //make sure that there isn't any path info in the objectName - //some parsers can return paths, not just file names - objectName = FilenameUtils.getName(objectName); - 
return objectName; - } - - private void addContent(ContentHandler handler, Metadata metadata) { - - if (handler.getClass().equals(DefaultHandler.class)){ - //no-op: we can't rely on just testing for - //empty content because DefaultHandler's toString() - //returns e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd" } else { - String content = handler.toString(); - if (content != null && content.trim().length() > 0 ) { - metadata.add(TIKA_CONTENT, content); - } + objectName = "embedded-" + (++state.unknownCount); } - + //make sure that there isn't any path info in the objectName + //some parsers can return paths, not just file names + objectName = FilenameUtils.getName(objectName); + return objectName; } @@ -262,14 +321,16 @@ public class RecursiveParserWrapper extends ParserDecorator { private static final long serialVersionUID = 207648200464263337L; private String location = null; + private final ParserState parserState; - private EmbeddedParserDecorator(String location) { - super(wrappedParser); + private EmbeddedParserDecorator(Parser parser, String location, ParserState parseState) { + super(parser); this.location = location; if (! 
this.location.endsWith("/")) { this.location += "/"; } + this.parserState = parseState; } @Override @@ -277,24 +338,23 @@ public class RecursiveParserWrapper extends ParserDecorator { Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { //Test to see if we should avoid parsing - if (maxEmbeddedResources > -1 && - metadatas.size() >= maxEmbeddedResources) { - hitMaxEmbeddedResources = true; + if (parserState.recursiveParserWrapperHandler.hasHitMaximumEmbeddedResources()) { return; } // Work out what this thing is - String objectName = getResourceName(metadata); + String objectName = getResourceName(metadata, parserState); String objectLocation = this.location + objectName; - metadata.add(EMBEDDED_RESOURCE_PATH, objectLocation); + metadata.add(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, objectLocation); + + + //get a fresh handler + ContentHandler localHandler = parserState.recursiveParserWrapperHandler.getNewContentHandler(); + parserState.recursiveParserWrapperHandler.startEmbeddedDocument(localHandler, metadata); - //ignore the content handler that is passed in - //and get a fresh handler - ContentHandler localHandler = contentHandlerFactory.getNewContentHandler(); - Parser preContextParser = context.get(Parser.class); - context.set(Parser.class, new EmbeddedParserDecorator(objectLocation)); - long started = new Date().getTime(); + context.set(Parser.class, new EmbeddedParserDecorator(getWrappedParser(), objectLocation, parserState)); + long started = System.currentTimeMillis(); try { super.parse(stream, localHandler, metadata, context); } catch (SAXException e) { @@ -316,20 +376,31 @@ public class RecursiveParserWrapper extends ParserDecorator { } } finally { context.set(Parser.class, preContextParser); - long elapsedMillis = new Date().getTime() - started; + long elapsedMillis = System.currentTimeMillis() - started; metadata.set(PARSE_TIME_MILLIS, Long.toString(elapsedMillis)); } //Because of recursion, we 
need //to re-test to make sure that we limit the //number of stored resources - if (maxEmbeddedResources > -1 && - metadatas.size() >= maxEmbeddedResources) { - hitMaxEmbeddedResources = true; + if (parserState.recursiveParserWrapperHandler.hasHitMaximumEmbeddedResources()) { return; } - addContent(localHandler, metadata); - metadatas.add(ParserUtils.cloneMetadata(metadata)); - } + parserState.recursiveParserWrapperHandler.endEmbeddedDocument(localHandler, metadata); + } + } + + /** + * This tracks the state of the parse of a single document. + * In future versions, this will allow the RecursiveParserWrapper to be thread safe. + */ + private class ParserState { + private int unknownCount = 0; + private final AbstractRecursiveParserWrapperHandler recursiveParserWrapperHandler; + private ParserState(AbstractRecursiveParserWrapperHandler handler) { + this.recursiveParserWrapperHandler = handler; + } + + } } diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java new file mode 100644 index 0000000..58f9ec6 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.sax; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.utils.ParserUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import java.io.OutputStream; +import java.nio.charset.Charset; + +/** + * This is a special handler to be used only with the {@link org.apache.tika.parser.RecursiveParserWrapper}. + * It allows for finer-grained processing of embedded documents than in the legacy handlers. + * Subclasses can choose how to process individual embedded documents. + */ +public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandler { + + public final static Property TIKA_CONTENT = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content"); + public final static Property PARSE_TIME_MILLIS = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + "parse_time_millis"); + public final static Property WRITE_LIMIT_REACHED = + Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "write_limit_reached"); + public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = + Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached"); + + public final static Property EMBEDDED_EXCEPTION = ParserUtils.EMBEDDED_EXCEPTION; + + public final static Property EMBEDDED_RESOURCE_PATH = + Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path"); + + private final ContentHandlerFactory contentHandlerFactory; + private final int maxEmbeddedResources; + private int embeddedResources = 0; + + public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) { + this(contentHandlerFactory, -1); + } + + public 
AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) { + this.contentHandlerFactory = contentHandlerFactory; + this.maxEmbeddedResources = maxEmbeddedResources; + } + + public ContentHandler getNewContentHandler() { + return contentHandlerFactory.getNewContentHandler(); + } + + public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { + return contentHandlerFactory.getNewContentHandler(os, charset); + } + + /** + * This is called before parsing each embedded document. Override this + * for custom behavior. Make sure to call this in your custom classes + * because this tracks the number of embedded documents. + * + * @param contentHandler local handler to be used on this embedded document + * @param metadata embedded document's metadata + */ + public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { + embeddedResources++; + } + /** + * This is called after parsing each embedded document. Override this + * for custom behavior. This is currently a no-op. + * + * @param contentHandler content handler that was used on this embedded document + * @param metadata metadata for this embedded document + * @throws SAXException + */ + public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { + } + + /** + * This is called after the full parse has completed. Override this + * for custom behavior. Make sure to call this as <code>super.endDocument(...)</code> + * in subclasses because this adds whether or not the embedded resource + * maximum has been hit to the metadata. 
+ * + * @param contentHandler content handler that was used on the main document + * @param metadata metadata that was gathered for the main document + * @throws SAXException + */ + public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { + if (hasHitMaximumEmbeddedResources()) { + metadata.set(EMBEDDED_RESOURCE_LIMIT_REACHED, "true"); + } + } + + /** + * + * @return whether this handler has hit the maximum embedded resources during the parse + */ + public boolean hasHitMaximumEmbeddedResources() { + if (maxEmbeddedResources > -1 && embeddedResources > maxEmbeddedResources) { + return true; + } + return false; + } + +} diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java index c611f09..899994e 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java @@ -19,6 +19,7 @@ package org.apache.tika.sax; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; import java.util.Locale; import org.xml.sax.ContentHandler; @@ -116,40 +117,48 @@ public class BasicContentHandlerFactory implements ContentHandlerFactory { @Override public ContentHandler getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException { + return getNewContentHandler(os, Charset.forName(encoding)); + } + + @Override + public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { if (type == HANDLER_TYPE.IGNORE) { return new DefaultHandler(); } - - if (writeLimit > -1) { - switch(type) { - case BODY: - return new WriteOutContentHandler( - new BodyContentHandler( - new OutputStreamWriter(os, encoding)), writeLimit); - case TEXT: - return new WriteOutContentHandler(new ToTextContentHandler(os, encoding), 
writeLimit); - case HTML: - return new WriteOutContentHandler(new ToHTMLContentHandler(os, encoding), writeLimit); - case XML: - return new WriteOutContentHandler(new ToXMLContentHandler(os, encoding), writeLimit); - default: - return new WriteOutContentHandler(new ToTextContentHandler(os, encoding), writeLimit); - } - } else { - switch (type) { - case BODY: - return new BodyContentHandler(new OutputStreamWriter(os, encoding)); - case TEXT: - return new ToTextContentHandler(os, encoding); - case HTML: - return new ToHTMLContentHandler(os, encoding); - case XML: - return new ToXMLContentHandler(os, encoding); - default: - return new ToTextContentHandler(os, encoding); - + try { + if (writeLimit > -1) { + switch (type) { + case BODY: + return new WriteOutContentHandler( + new BodyContentHandler( + new OutputStreamWriter(os, charset)), writeLimit); + case TEXT: + return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit); + case HTML: + return new WriteOutContentHandler(new ToHTMLContentHandler(os, charset.name()), writeLimit); + case XML: + return new WriteOutContentHandler(new ToXMLContentHandler(os, charset.name()), writeLimit); + default: + return new WriteOutContentHandler(new ToTextContentHandler(os, charset.name()), writeLimit); + } + } else { + switch (type) { + case BODY: + return new BodyContentHandler(new OutputStreamWriter(os, charset)); + case TEXT: + return new ToTextContentHandler(os, charset.name()); + case HTML: + return new ToHTMLContentHandler(os, charset.name()); + case XML: + return new ToXMLContentHandler(os, charset.name()); + default: + return new ToTextContentHandler(os, charset.name()); + + } } + } catch (UnsupportedEncodingException e) { + throw new RuntimeException("couldn't find charset for name: "+charset); } } diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java index c69b980..9dd74c4 100644 --- 
a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java @@ -21,12 +21,18 @@ import org.xml.sax.ContentHandler; import java.io.OutputStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; /** * Interface to allow easier injection of code for getting a new ContentHandler */ public interface ContentHandlerFactory { public ContentHandler getNewContentHandler(); + /** + * @deprecated use {@link #getNewContentHandler(OutputStream, Charset)} + */ + @Deprecated public ContentHandler getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException; + public ContentHandler getNewContentHandler(OutputStream os, Charset charset); } diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java new file mode 100644 index 0000000..2444a9c --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.sax; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.utils.ParserUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import java.util.LinkedList; +import java.util.List; + +/** + * This is the default implementation of {@link AbstractRecursiveParserWrapperHandler}. + * See its documentation for more details. + * + * This caches the a metadata object for each embedded file and for the container file. + * It places the extracted content in the metadata object, with this key: {@link AbstractRecursiveParserWrapperHandler#TIKA_CONTENT} + * If memory is a concern, subclass AbstractRecursiveParserWrapperHandler to handle each + * embedded document. + * <p> + * <b>NOTE: This handler must only be used with the {@link org.apache.tika.parser.RecursiveParserWrapper}</b> + * </p> + */ +public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrapperHandler { + + private final List<Metadata> metadataList = new LinkedList<>(); + + /** + * Create a handler with no limit on the number of embedded resources + */ + public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) { + super(contentHandlerFactory); + } + + /** + * Create a handler that limits the number of embedded resources that will be + * parsed + * @param maxEmbeddedResources number of embedded resources that will be parsed + */ + public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) { + super(contentHandlerFactory, maxEmbeddedResources); + } + + /** + * This is called before parsing an embedded document + * + * @param contentHandler - local content handler to use on the embedded document + * @param metadata metadata to use for the embedded document + * @throws SAXException + */ + @Override + public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { + 
super.startEmbeddedDocument(contentHandler, metadata); + } + + /** + * This is called after parsing an embedded document. + * @param contentHandler local contenthandler used on the embedded document + * @param metadata metadata from the embedded document + * @throws SAXException + */ + @Override + public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { + super.endEmbeddedDocument(contentHandler, metadata); + addContent(contentHandler, metadata); + metadataList.add(ParserUtils.cloneMetadata(metadata)); + } + + /** + * + * @param contentHandler content handler used on the main document + * @param metadata metadata from the main document + * @throws SAXException + */ + @Override + public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { + super.endDocument(contentHandler, metadata); + addContent(contentHandler, metadata); + + metadataList.add(0, ParserUtils.cloneMetadata(metadata)); + } + + /** + * + * @return a list of Metadata objects, one for the main document and one for each embedded document + */ + public List<Metadata> getMetadataList() { + return metadataList; + } + + private void addContent(ContentHandler handler, Metadata metadata) { + + if (handler.getClass().equals(DefaultHandler.class)){ + //no-op: we can't rely on just testing for + //empty content because DefaultHandler's toString() + //returns e.g. 
"org.xml.sax.helpers.DefaultHandler@6c8b1edd" + } else { + String content = handler.toString(); + if (content != null && content.trim().length() > 0 ) { + metadata.add(TIKA_CONTENT, content); + } + } + } +} diff --git a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java index c0ec98c..bd0f263 100644 --- a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java @@ -22,7 +22,9 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.junit.Test; import org.xml.sax.helpers.DefaultHandler; @@ -136,8 +138,10 @@ public class MultiThreadedTikaTest extends TikaTest { //use the same parser in all threads Parser parser = new AutoDetectParser(); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); for (int i = 0; i < numThreads; i++) { - executorCompletionService.submit(new TikaRunner(parser, numIterations, testFiles, truth)); + executorCompletionService.submit(new TikaRunner(wrapper, numIterations, testFiles, truth)); } int completed = 0; @@ -184,12 +188,14 @@ public class MultiThreadedTikaTest extends TikaTest { try { Parser p = new AutoDetectParser(); - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), + -1); try (InputStream is = Files.newInputStream(f)) { - wrapper.parse(is, new DefaultHandler(), new Metadata(), new ParseContext()); + wrapper.parse(is, handler, new Metadata(), new ParseContext()); } - List<Metadata> metadataList = wrapper.getMetadata(); + List<Metadata> metadataList = handler.getMetadataList(); baseline.put(f, new Extract(metadataList)); } catch (Exception e) { //swallow @@ -198,26 +204,28 @@ public class MultiThreadedTikaTest extends TikaTest { return baseline; } - private static List<Metadata> getRecursiveMetadata(InputStream is, Parser p) throws Exception { + private static List<Metadata> getRecursiveMetadata(InputStream is, RecursiveParserWrapper wrapper) throws Exception { //different from parent TikaTest in that this extracts text. //can't extract xhtml because "tmp" file names wind up in //content's metadata and they'll differ by file. - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); - wrapper.parse(is, new DefaultHandler(), new Metadata(), new ParseContext()); - return wrapper.getMetadata(); + + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), + -1); + wrapper.parse(is, handler, new Metadata(), new ParseContext()); + return handler.getMetadataList(); } //TODO: make this return something useful besides an integer private class TikaRunner implements Callable<Integer> { - private final Parser parser; + private final RecursiveParserWrapper parser; private final int iterations; private final Path[] files; private final Map<Path, Extract> truth; private final Random random = new Random(); - private TikaRunner(Parser parser, int iterations, Path[] files, Map<Path, Extract> truth) { + private TikaRunner(RecursiveParserWrapper parser, int iterations, Path[] 
files, Map<Path, Extract> truth) { this.parser = parser; this.iterations = iterations; this.files = files; @@ -253,8 +261,8 @@ public class MultiThreadedTikaTest extends TikaTest { extractA.metadataList.get(i).size(), extractB.metadataList.get(i).size()); assertEquals("content in attachment: " + i, - extractA.metadataList.get(i).get(RecursiveParserWrapper.TIKA_CONTENT), - extractB.metadataList.get(i).get(RecursiveParserWrapper.TIKA_CONTENT)); + extractA.metadataList.get(i).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT), + extractB.metadataList.get(i).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } } diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index 59a002a..aae63e0 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -45,6 +45,7 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.sax.ToXMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.helpers.DefaultHandler; @@ -220,40 +221,47 @@ public abstract class TikaTest { protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception { Parser p = new AutoDetectParser(); - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { - wrapper.parse(is, new DefaultHandler(), metadata, context); + wrapper.parse(is, handler, metadata, context); } - 
return wrapper.getMetadata(); + return handler.getMetadataList(); } protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception { Parser p = new AutoDetectParser(); - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); + + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { - wrapper.parse(is, new DefaultHandler(), new Metadata(), context); + wrapper.parse(is, handler, new Metadata(), context); } - return wrapper.getMetadata(); + return handler.getMetadataList(); } protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap) throws Exception { - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap, + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { - wrapper.parse(is, new DefaultHandler(), new Metadata(), new ParseContext()); + wrapper.parse(is, handler, new Metadata(), new ParseContext()); } - return wrapper.getMetadata(); + return handler.getMetadataList(); } protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap, ParseContext parseContext) throws Exception { - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap, + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { - 
wrapper.parse(is, new DefaultHandler(), new Metadata(), parseContext); + wrapper.parse(is, handler, new Metadata(), parseContext); } - return wrapper.getMetadata(); + return handler.getMetadataList(); } diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java index 5029ecf..0a67ad0 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java @@ -61,6 +61,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.PagedText; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.utils.ExceptionUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -254,7 +255,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer { data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString()); } else { data.put(Cols.IS_EMBEDDED, TRUE); - data.put(Cols.FILE_NAME, getFileName(m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH))); + data.put(Cols.FILE_NAME, getFileName(m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH))); } String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME)); ext = (ext == null) ? 
"" : ext.toLowerCase(Locale.US); @@ -391,7 +392,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer { String getTime(Metadata m) { String elapsed = "-1"; - String v = m.get(RecursiveParserWrapper.PARSE_TIME_MILLIS); + String v = m.get(AbstractRecursiveParserWrapperHandler.PARSE_TIME_MILLIS); if (v != null) { return v; } @@ -414,7 +415,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer { String fullTrace = metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime"); if (fullTrace == null) { - fullTrace = metadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION); + fullTrace = metadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION); } if (fullTrace != null) { @@ -475,7 +476,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer { if (metadata == null) { return ""; } - String c = metadata.get(RecursiveParserWrapper.TIKA_CONTENT); + String c = metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); if (c == null) { return ""; } @@ -723,7 +724,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer { Map<String, Integer> counts = new HashMap<>(); for (int i = 1; i < list.size(); i++) { - String path = list.get(i).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH); + String path = list.get(i).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH); if (path == null) { //shouldn't ever happen continue; @@ -745,7 +746,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer { } for (int i = 1; i < list.size(); i++) { - Integer count = counts.get(list.get(i).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)); + Integer count = counts.get(list.get(i).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)); if (count == null) { count = 0; } diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java index f142c5b..1ff5f0b 100644 --- 
a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java @@ -45,6 +45,7 @@ import org.apache.tika.eval.tokens.TokenIntPair; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; public class ExtractComparer extends AbstractProfiler { @@ -352,10 +353,10 @@ public class ExtractComparer extends AbstractProfiler { String pathA = null; String pathB = null; if (mA != null) { - pathA = mA.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH); + pathA = mA.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH); } if (mB != null) { - pathB = mB.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH); + pathB = mB.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH); } if (pathA != null) { Map<Cols, String> d = new HashMap<>(); @@ -390,7 +391,7 @@ public class ExtractComparer extends AbstractProfiler { /** - * Try to find the matching metadata based on the RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH + * Try to find the matching metadata based on the AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH * If you can't find it, return -1; * * @param i index for match in metadataListA @@ -419,11 +420,11 @@ public class ExtractComparer extends AbstractProfiler { //assume same embedded resource path. Not always true! 
Metadata thisMetadata = metadataListA.get(i); - String embeddedPath = thisMetadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH); + String embeddedPath = thisMetadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH); if (embeddedPath != null) { for (int j = 0; j < metadataListB.size(); j++) { String thatEmbeddedPath = metadataListB.get(j).get( - RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH); + AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH); if (embeddedPath.equals(thatEmbeddedPath)) { return j; } diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java index d5f9af3..200bf33 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java @@ -37,6 +37,7 @@ import org.apache.tika.eval.io.ExtractReaderException; import org.apache.tika.eval.io.IDBWriter; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; public class ExtractProfiler extends AbstractProfiler { @@ -247,7 +248,7 @@ public class ExtractProfiler extends AbstractProfiler { Map<Cols, String> data = new HashMap<>(); data.put(Cols.ID, fileId); data.put(Cols.EMBEDDED_FILE_PATH, - m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)); + m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)); try { writer.writeRow(embeddedFilePathTable, data); } catch (IOException e) { diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java index 7bafa97..d406919 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java @@ -24,6 +24,7 @@ import org.apache.tika.metadata.TikaCoreProperties; import 
org.apache.tika.metadata.serialization.JsonMetadataList; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -149,13 +150,13 @@ public class ExtractReader { Metadata containerMetadata = metadataList.get(0); for (int i = 0; i < metadataList.size(); i++) { Metadata m = metadataList.get(i); - String c = m.get(RecursiveParserWrapper.TIKA_CONTENT); + String c = m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); if (c != null) { sb.append(c); sb.append(" "); } } - containerMetadata.set(RecursiveParserWrapper.TIKA_CONTENT, sb.toString()); + containerMetadata.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, sb.toString()); while (metadataList.size() > 1) { metadataList.remove(metadataList.size()-1); } @@ -179,7 +180,7 @@ public class ExtractReader { List<Metadata> metadataList = new ArrayList<>(); String content = IOUtils.toString(reader); Metadata m = new Metadata(); - m.set(RecursiveParserWrapper.TIKA_CONTENT, content); + m.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, content); //Let's hope the file name has a suffix that can //be used to determine the mime. Could be wrong or missing, //but better than nothing. 
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java index ea516c6..de09fa1 100644 --- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java +++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java @@ -41,6 +41,7 @@ import org.apache.tika.eval.io.ExtractReaderException; import org.apache.tika.eval.util.LanguageIDWrapper; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; @@ -184,7 +185,7 @@ public class SimpleComparerTest extends TikaTest { @Test public void testGetContent() throws Exception { Metadata m = new Metadata(); - m.add(RecursiveParserWrapper.TIKA_CONTENT, "0123456789"); + m.add(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, "0123456789"); Map<Cols, String> data = new HashMap<>(); String content = getContent(m, 10, data); assertEquals(10, content.length()); @@ -233,23 +234,23 @@ public class SimpleComparerTest extends TikaTest { public void testAttachmentCounts() { List<Metadata> list = new ArrayList<>(); Metadata m0 = new Metadata(); - m0.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "dir1/dir2/file.zip");//bad data should be ignored + m0.set(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "dir1/dir2/file.zip");//bad data should be ignored //in the first metadata object list.add(m0); Metadata m1 = new Metadata(); - m1.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip/text1.txt"); + m1.set(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip/text1.txt"); list.add(m1); Metadata m2 = new Metadata(); - m2.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip/text2.txt"); + m2.set(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, 
"/f1.docx/f2.zip/text2.txt"); list.add(m2); Metadata m3 = new Metadata(); - m3.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip"); + m3.set(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip"); list.add(m3); Metadata m4 = new Metadata(); - m4.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx"); + m4.set(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "/f1.docx"); list.add(m4); Metadata m5 = new Metadata(); - m5.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/text3.txt"); + m5.set(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, "/f1.docx/text3.txt"); list.add(m5); List<Integer> counts = AbstractProfiler.countAttachments(list); diff --git a/tika-eval/src/test/java/org/apache/tika/eval/io/ExtractReaderTest.java b/tika-eval/src/test/java/org/apache/tika/eval/io/ExtractReaderTest.java index f22179a..47d5934 100644 --- a/tika-eval/src/test/java/org/apache/tika/eval/io/ExtractReaderTest.java +++ b/tika-eval/src/test/java/org/apache/tika/eval/io/ExtractReaderTest.java @@ -26,6 +26,7 @@ import java.util.List; import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.junit.Before; import org.junit.Test; @@ -47,24 +48,24 @@ public class ExtractReaderTest extends TikaTest { List<Metadata> metadataList = extractReader.loadExtract(testJsonFile); assertEquals(2, metadataList.size()); - assertEquals(1, metadataList.get(0).getValues(RecursiveParserWrapper.TIKA_CONTENT).length); - assertEquals(1, metadataList.get(1).getValues(RecursiveParserWrapper.TIKA_CONTENT).length); - assertContains("fox", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("attachment", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertEquals(1, 
metadataList.get(0).getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length); + assertEquals(1, metadataList.get(1).getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length); + assertContains("fox", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); + assertContains("attachment", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY); metadataList = extractReader.loadExtract(testJsonFile); assertEquals(1, metadataList.size()); - assertEquals(1, metadataList.get(0).getValues(RecursiveParserWrapper.TIKA_CONTENT).length); - assertContains("fox", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); - assertNotContained("attachment", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertEquals(1, metadataList.get(0).getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length); + assertContains("fox", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); + assertNotContained("attachment", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST); metadataList = extractReader.loadExtract(testJsonFile); assertEquals(1, metadataList.size()); - assertEquals(1, metadataList.get(0).getValues(RecursiveParserWrapper.TIKA_CONTENT).length); - assertContains("fox", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("attachment", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertEquals(1, metadataList.get(0).getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length); + assertContains("fox", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); + assertContains("attachment", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } @Test @@ -73,9 
+74,9 @@ public class ExtractReaderTest extends TikaTest { List<Metadata> metadataList = extractReader.loadExtract(testTxtFile); assertEquals(1, metadataList.size()); Metadata m = metadataList.get(0); - assertEquals(1, m.getValues(RecursiveParserWrapper.TIKA_CONTENT).length); + assertEquals(1, m.getValues(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length); assertEquals("the quick brown fox fox fox jumped over the lazy lazy dog\n", - m.get(RecursiveParserWrapper.TIKA_CONTENT)); + m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); //test that the mime is inferred from the file extension assertEquals("application/msword", m.get(Metadata.CONTENT_TYPE)); diff --git a/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java b/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java index a5d2c3b..d85c2af 100644 --- a/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java @@ -40,6 +40,7 @@ import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerFactory; +import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; @@ -164,15 +165,16 @@ public class ParsingExample { ContentHandlerFactory factory = new BasicContentHandlerFactory( BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1); - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, factory); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded.docx"); ParseContext context = new ParseContext(); - + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(factory, -1); try (InputStream stream = 
ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) { - wrapper.parse(stream, new DefaultHandler(), metadata, context); + wrapper.parse(stream, handler, metadata, context); } - return wrapper.getMetadata(); + + return handler.getMetadataList(); } /** diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java index b1a0caa..1044298 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java @@ -170,6 +170,11 @@ public class PickBestTextEncodingParser extends AbstractMultipleParser { String encoding) throws UnsupportedEncodingException { return getNewContentHandler(); } + + @Override + public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { + return getNewContentHandler(); + } } protected class CharsetTester { diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java index b1b72ca..5c2e11c 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java @@ -19,6 +19,7 @@ package org.apache.tika.parser; import static org.apache.tika.TikaTest.assertContains; +import static org.apache.tika.TikaTest.debug; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; @@ -34,8 +35,10 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.utils.CommonsDigester; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import 
org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; +import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.utils.ParserUtils; import org.junit.Test; import org.xml.sax.helpers.DefaultHandler; @@ -47,7 +50,7 @@ public class RecursiveParserWrapperTest { List<Metadata> list = getMetadata(new Metadata(), new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); Metadata container = list.get(0); - String content = container.get(RecursiveParserWrapper.TIKA_CONTENT); + String content = container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); //not much differentiates html from xml in this test file assertTrue(content.indexOf("<p class=\"header\" />") > -1); } @@ -57,7 +60,7 @@ public class RecursiveParserWrapperTest { List<Metadata> list = getMetadata(new Metadata(), new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1)); Metadata container = list.get(0); - String content = container.get(RecursiveParserWrapper.TIKA_CONTENT); + String content = container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); //not much differentiates html from xml in this test file assertTrue(content.indexOf("<p class=\"header\"></p>") > -1); } @@ -67,7 +70,7 @@ public class RecursiveParserWrapperTest { List<Metadata> list = getMetadata(new Metadata(), new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); Metadata container = list.get(0); - String content = container.get(RecursiveParserWrapper.TIKA_CONTENT); + String content = container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); assertTrue(content.indexOf("<p ") < 0); assertTrue(content.indexOf("embed_0") > -1); } @@ -77,7 +80,7 @@ public class RecursiveParserWrapperTest { List<Metadata> list = getMetadata(new Metadata(), new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); Metadata container = list.get(0); - String content = 
container.get(RecursiveParserWrapper.TIKA_CONTENT); + String content = container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); assertNull(content); } @@ -88,18 +91,19 @@ public class RecursiveParserWrapperTest { Metadata metadata = new Metadata(); Parser wrapped = new AutoDetectParser(); - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60)); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped); InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream( "/test-documents/test_recursive_embedded.docx"); - wrapper.parse(stream, new DefaultHandler(), metadata, context); - List<Metadata> list = wrapper.getMetadata(); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60)); + wrapper.parse(stream, handler, metadata, context); + List<Metadata> list = handler.getMetadataList(); assertEquals(5, list.size()); int wlr = 0; for (Metadata m : list) { - String limitReached = m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED); + String limitReached = m.get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED); if (limitReached != null && limitReached.equals("true")) { wlr++; } @@ -108,8 +112,12 @@ public class RecursiveParserWrapperTest { } + /** + * @deprecated this will be removed in 1.20 or 2.0 + * @throws Exception + */ @Test - public void testMaxEmbedded() throws Exception { + public void testMaxEmbeddedLegacy() throws Exception { int maxEmbedded = 4; int totalNoLimit = 12;//including outer container file ParseContext context = new ParseContext(); @@ -127,7 +135,7 @@ public class RecursiveParserWrapperTest { //test default assertEquals(totalNoLimit, list.size()); - limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED); + limitReached = 
list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED); assertNull(limitReached); @@ -143,9 +151,9 @@ public class RecursiveParserWrapperTest { list = wrapper.getMetadata(); //add 1 for outer container file - assertEquals(maxEmbedded + 1, list.size()); + assertEquals(maxEmbedded, list.size()); - limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED); + limitReached = list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED); assertEquals("true", limitReached); wrapper.reset(); @@ -158,11 +166,68 @@ public class RecursiveParserWrapperTest { wrapper.setMaxEmbeddedResources(-2); wrapper.parse(stream, new DefaultHandler(), metadata, context); + assertEquals(totalNoLimit, wrapper.getMetadata().size()); + limitReached = wrapper.getMetadata().get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED); + assertNull(limitReached); + } + + @Test + public void testMaxEmbedded() throws Exception { + int maxEmbedded = 4; + int totalNoLimit = 12;//including outer container file + ParseContext context = new ParseContext(); + Metadata metadata = new Metadata(); + String limitReached = null; + + Parser wrapped = new AutoDetectParser(); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped); + + InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream( + "/test-documents/test_recursive_embedded.docx"); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,-1)); + wrapper.parse(stream, handler, metadata, context); + List<Metadata> list = handler.getMetadataList(); + //test default assertEquals(totalNoLimit, list.size()); - limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED); + + limitReached = list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED); + 
assertNull(limitReached); + + stream.close(); + + //test setting value + metadata = new Metadata(); + stream = RecursiveParserWrapperTest.class.getResourceAsStream( + "/test-documents/test_recursive_embedded.docx"); + handler = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), maxEmbedded); + wrapper.parse(stream, handler, metadata, context); + list = handler.getMetadataList(); + + //add 1 for outer container file + assertEquals(maxEmbedded, list.size()); + + limitReached = list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED); + assertEquals("true", limitReached); + + stream.close(); + + //test setting value < 0 + metadata = new Metadata(); + stream = RecursiveParserWrapperTest.class.getResourceAsStream( + "/test-documents/test_recursive_embedded.docx"); + handler = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,-1), -2); + wrapper.parse(stream, handler, metadata, context); + list = handler.getMetadataList(); + assertEquals(totalNoLimit, list.size()); + limitReached = list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED); assertNull(limitReached); } + @Test public void testEmbeddedResourcePath() throws Exception { @@ -184,12 +249,12 @@ public class RecursiveParserWrapperTest { List<Metadata> list = getMetadata(metadata, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); Metadata container = list.get(0); - String content = container.get(RecursiveParserWrapper.TIKA_CONTENT); + String content = container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); assertTrue(content.indexOf("<p class=\"header\" />") > -1); Set<String> seen = new HashSet<String>(); for (Metadata m : list) { - String path = m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH); + String path = m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH); 
if (path != null) { seen.add(path); } @@ -231,8 +296,10 @@ public class RecursiveParserWrapperTest { ParseContext context = new ParseContext(); Parser wrapped = new AutoDetectParser(); - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, true); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); + String path = "/test-documents/mock/embedded_then_npe.xml"; InputStream stream = null; @@ -240,7 +307,7 @@ public class RecursiveParserWrapperTest { try { stream = RecursiveParserWrapperTest.class.getResourceAsStream( path); - wrapper.parse(stream, new DefaultHandler(), metadata, context); + wrapper.parse(stream, handler, metadata, context); } catch (TikaException e) { if (e.getCause().getClass().equals(NullPointerException.class)) { npe = true; @@ -250,15 +317,15 @@ public class RecursiveParserWrapperTest { } assertTrue("npe", npe); - List<Metadata> metadataList = wrapper.getMetadata(); + List<Metadata> metadataList = handler.getMetadataList(); assertEquals(2, metadataList.size()); Metadata outerMetadata = metadataList.get(0); Metadata embeddedMetadata = metadataList.get(1); - assertContains("main_content", outerMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("main_content", outerMetadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); assertEquals("embedded_then_npe.xml", outerMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY)); assertEquals("Nikolai Lobachevsky", outerMetadata.get("author")); - assertContains("some_embedded_content", embeddedMetadata.get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("some_embedded_content", embeddedMetadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); assertEquals("embed1.xml", 
embeddedMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY)); assertEquals("embeddedAuthor", embeddedMetadata.get("author")); } @@ -269,7 +336,7 @@ public class RecursiveParserWrapperTest { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded.docx"); List<Metadata> list = getMetadata(metadata, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), - true, new CommonsDigester(100000, CommonsDigester.DigestAlgorithm.MD5)); + true, new CommonsDigester(100000, "md5")); int i = 0; Metadata m0 = list.get(0); Metadata m6 = list.get(6); @@ -287,8 +354,7 @@ public class RecursiveParserWrapperTest { if (digester != null) { wrapped = new DigestingParser(wrapped, digester); } - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, - contentHandlerFactory, catchEmbeddedExceptions); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, catchEmbeddedExceptions); String path = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); if (path == null) { path = "/test-documents/test_recursive_embedded.docx"; @@ -296,13 +362,14 @@ public class RecursiveParserWrapperTest { path = "/test-documents/" + path; } InputStream stream = null; + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory); try { stream = TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(path).toURI()); - wrapper.parse(stream, new DefaultHandler(), metadata, context); + wrapper.parse(stream, handler, metadata, context); } finally { IOUtils.closeQuietly(stream); } - return wrapper.getMetadata(); + return handler.getMetadataList(); } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 532abed..def25d1 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ 
-71,6 +71,7 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.LinkContentHandler; import org.apache.tika.sax.TeeContentHandler; @@ -1249,8 +1250,8 @@ public class HtmlParserTest extends TikaTest { assertEquals(2, metadataList.size()); assertEquals("MACRO", metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); assertContains("cool", - metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); - assertNotContained("cool", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); + assertNotContained("cool", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } @Test @@ -1263,8 +1264,8 @@ public class HtmlParserTest extends TikaTest { assertEquals(2, metadataList.size()); assertEquals("MACRO", metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); assertContains("cool", - metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); - assertNotContained("cool", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); + assertNotContained("cool", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } @@ -1272,7 +1273,7 @@ public class HtmlParserTest extends TikaTest { public void testDataURI() throws Exception { List<Metadata> metadataList = getRecursiveMetadata("testHTML_embedded_img.html"); assertEquals(2, metadataList.size()); - String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT); + String content = metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); assertContains("some content", content); 
//make sure that you've truncated the data: value assertContains("src=\"data:\"", content); @@ -1290,7 +1291,7 @@ public class HtmlParserTest extends TikaTest { Parser p = new AutoDetectParser(tikaConfig); List<Metadata> metadataList = getRecursiveMetadata("testHTML_embedded_img_in_js.html", p); assertEquals(3, metadataList.size()); - String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT); + String content = metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); assertContains("some content", content); Metadata imgMetadata = metadataList.get(1); assertEquals("image/jpeg", imgMetadata.get(Metadata.CONTENT_TYPE)); diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java index 593e210..b479027 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java @@ -41,8 +41,10 @@ import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.sax.ToXMLContentHandler; import org.junit.Test; import org.xml.sax.ContentHandler; @@ -145,31 +147,35 @@ public class SQLite3ParserTest extends TikaTest { Parser p = new AutoDetectParser(); RecursiveParserWrapper wrapper = - new RecursiveParserWrapper(p, new BasicContentHandlerFactory( - BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1)); + new RecursiveParserWrapper(p); Metadata metadata = new Metadata(); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + new 
BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1) + ); + try (InputStream is = getResourceAsStream(TEST_FILE1)) { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, TEST_FILE_NAME); - wrapper.parse(is, new BodyContentHandler(-1), metadata, new ParseContext()); + wrapper.parse(is, handler, metadata, new ParseContext()); } - List<Metadata> metadataList = wrapper.getMetadata(); + List<Metadata> metadataList = handler.getMetadataList(); int i = 0; assertEquals(5, metadataList.size()); //make sure the \t are inserted in a body handler - String table = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT); + String table = metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); assertContains("0\t2.3\t2.4\tlorem", table); assertContains("普林斯顿大学", table); //make sure the \n is inserted - String table2 = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT); + String table2 = metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); assertContains("do eiusmod tempor\n", table2); - assertContains("The quick brown fox", metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT)); - assertContains("The quick brown fox", metadataList.get(4).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("The quick brown fox", metadataList.get(2).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); + assertContains("The quick brown fox", metadataList.get(4).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); //confirm .doc was added to blob - assertEquals("/BYTES_COL_0.doc/image1.png", metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)); + assertEquals("/BYTES_COL_0.doc/image1.png", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)); } @Test diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java index ca7befe..ebd4ac5 
100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java @@ -58,6 +58,7 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.parser.ocr.TesseractOCRParserTest; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.junit.BeforeClass; @@ -267,17 +268,17 @@ public class RFC822ParserTest extends TikaTest { //not treated as an attachment. TIKA-2547 List<Metadata> metadataList = getRecursiveMetadata("testRFC822_oddfrom"); assertEquals(7, metadataList.size()); - assertContains("Air Quality Planning", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("Air Quality Planning", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); //Make sure text alternative doesn't get treated as an attachment metadataList = getRecursiveMetadata("testRFC822_normal_zip"); assertEquals(3, metadataList.size()); - assertContains("This is the HTML part", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("This is the HTML part", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); assertEquals("application/zip", metadataList.get(2).get(Metadata.CONTENT_TYPE)); metadataList = getRecursiveMetadata("testRFC822-txt-body"); assertEquals(2, metadataList.size()); - assertContains("body 1", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("body 1", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } /** @@ -572,7 +573,7 @@ public class RFC822ParserTest extends TikaTest { assertEquals("text/plain; charset=UTF-8", metadataList.get(1).get(Metadata.CONTENT_TYPE)); assertEquals("image/png", 
metadataList.get(2).get(Metadata.CONTENT_TYPE)); assertEquals("testPNG.png", metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY)); - assertContains("This email has a PNG attachment included in it", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("This email has a PNG attachment included in it", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); assertEquals(null, metadataList.get(1).get(Metadata.CONTENT_DISPOSITION)); assertEquals("attachment; filename=\"testPNG.png\"", metadataList.get(2).get(Metadata.CONTENT_DISPOSITION)); } @@ -628,12 +629,12 @@ public class RFC822ParserTest extends TikaTest { */ List<Metadata> metadataList = getRecursiveMetadata("testRFC822-multipart"); assertEquals(2, metadataList.size()); - String body = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT); + String body = metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); assertContains("body 2", body); assertNotContained("body 1", body); assertEquals("message/rfc822", metadataList.get(0).get(Metadata.CONTENT_TYPE)); assertEquals("image/gif", metadataList.get(1).get(Metadata.CONTENT_TYPE)); - assertEquals("/logo.gif", metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)); + assertEquals("/logo.gif", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)); } @Test @@ -652,17 +653,17 @@ public class RFC822ParserTest extends TikaTest { List<Metadata> metadataList = getRecursiveMetadata("testRFC822-mixed-simple"); assertEquals(3, metadataList.size()); - assertContains("body 2", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); - assertNotContained("body 1", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("body 2", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); + assertNotContained("body 1", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); 
assertEquals("message/rfc822", metadataList.get(0).get(Metadata.CONTENT_TYPE)); assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE)); - assertEquals("/Mary with cooler.jpeg", metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)); + assertEquals("/Mary with cooler.jpeg", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)); assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString(), metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); assertEquals("image/jpeg", metadataList.get(2).get(Metadata.CONTENT_TYPE)); - assertEquals("/mary-coffee.jpg", metadataList.get(2).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)); + assertEquals("/mary-coffee.jpg", metadataList.get(2).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)); assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString(), metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); } @@ -683,19 +684,19 @@ public class RFC822ParserTest extends TikaTest { */ List<Metadata> metadataList = getRecursiveMetadata("testRFC822-mixed-with-pdf-inline"); assertEquals(2, metadataList.size()); - String body = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT); + String body = metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); assertContains("body 2", body); assertContains("body 3", body); assertNotContained("body 1", body); assertEquals("message/rfc822", metadataList.get(0).get(Metadata.CONTENT_TYPE)); assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE)); - assertEquals("/tzora-titan-4-hummer-xl-manual.pdf", metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)); + assertEquals("/tzora-titan-4-hummer-xl-manual.pdf", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)); } @Test public void testSimpleBodyInlined() throws Exception { List<Metadata> metadataList = 
getRecursiveMetadata("testRFC822_simple_inline_body.txt"); assertEquals(1, metadataList.size()); - assertContains("asked", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("asked", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java index 1956e58..f75bd59 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java @@ -30,6 +30,7 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.BodyContentHandler; import org.junit.Before; import org.junit.Test; @@ -174,7 +175,7 @@ public class MboxParserTest extends TikaTest { assertEquals(2, metadataList.size()); assertEquals("application/mbox", metadataList.get(0).get(Metadata.CONTENT_TYPE)); assertEquals("message/rfc822", metadataList.get(1).get(Metadata.CONTENT_TYPE)); - assertContains("body 2", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); - assertNotContained("body 1", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("body 2", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); + assertNotContained("body 1", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java index 3825400..e6aef53 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java +++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/JackcessParserTest.java @@ -36,6 +36,7 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.junit.Test; import org.xml.sax.helpers.DefaultHandler; @@ -46,22 +47,24 @@ public class JackcessParserTest extends TikaTest { Parser p = new AutoDetectParser(); - RecursiveParserWrapper w = new RecursiveParserWrapper(p, - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + RecursiveParserWrapper w = new RecursiveParserWrapper(p); for (String fName : new String[]{"testAccess2.accdb", "testAccess2_2000.mdb", "testAccess2_2002-2003.mdb"}) { InputStream is = null; + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1) + ); try { is = this.getResourceAsStream("/test-documents/" + fName); Metadata meta = new Metadata(); ParseContext c = new ParseContext(); - w.parse(is, new DefaultHandler(), meta, c); + w.parse(is, handler, meta, c); } finally { IOUtils.closeQuietly(is); } - List<Metadata> list = w.getMetadata(); + List<Metadata> list = handler.getMetadataList(); assertEquals(4, list.size()); String mainContent = list.get(0).get(RecursiveParserWrapper.TIKA_CONTENT); @@ -83,8 +86,6 @@ public class JackcessParserTest extends TikaTest { //test embedded document handling assertContains("Test Document with embedded pdf", list.get(3).get(RecursiveParserWrapper.TIKA_CONTENT)); - - w.reset(); } } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index da228dd..c17aad2 100644 --- 
a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -38,6 +38,7 @@ import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.parser.image.ImageParser; import org.apache.tika.parser.pdf.PDFParserConfig; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.BasicContentHandlerFactory; import org.junit.Test; import org.xml.sax.helpers.DefaultHandler; @@ -181,7 +182,7 @@ public class TesseractOCRParserTest extends TikaTest { StringBuilder contents = new StringBuilder(); for (Metadata m : metadataList) { - contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT)); + contents.append(m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } for (String needle : nonOCRContains) { diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 45acf9b..55489a8 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -61,9 +61,11 @@ import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.ocr.TesseractOCRParser; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; +import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -621,8 +623,7 @@ public class PDFParserTest extends TikaTest { //"regressiveness" exists only in Unit10.doc not in the container pdf 
document assertTrue(xml.contains("regressiveness")); - RecursiveParserWrapper p = new RecursiveParserWrapper(new AutoDetectParser(), - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); + RecursiveParserWrapper p = new RecursiveParserWrapper(new AutoDetectParser()); ParseContext context = new ParseContext(); PDFParserConfig config = new PDFParserConfig(); config.setExtractInlineImages(true); @@ -630,12 +631,14 @@ public class PDFParserTest extends TikaTest { context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config); context.set(org.apache.tika.parser.Parser.class, p); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE,-1)); try (TikaInputStream tis = TikaInputStream.get( getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) { - p.parse(tis, new BodyContentHandler(-1), new Metadata(), context); + p.parse(tis, handler, new Metadata(), context); } - List<Metadata> metadatas = p.getMetadata(); + List<Metadata> metadatas = handler.getMetadataList(); assertEquals(5, metadatas.size()); assertNull(metadatas.get(0).get(TikaCoreProperties.RESOURCE_NAME_KEY)); @@ -660,7 +663,7 @@ public class PDFParserTest extends TikaTest { List<Metadata> metadatas = getRecursiveMetadata("testPDF_JBIG2.pdf", context); assertEquals(2, metadatas.size()); - assertContains("test images compressed using JBIG2", metadatas.get(0).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("test images compressed using JBIG2", metadatas.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); for (String key : metadatas.get(1).names()) { if (key.startsWith("X-TIKA:EXCEPTION")) { @@ -894,13 +897,13 @@ public class PDFParserTest extends TikaTest { assertEquals("metadata size", 5, metadatas.size()); assertEquals("file name", "Test.txt", metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); - assertContains("os specific", 
metadatas.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("os specific", metadatas.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); assertEquals("file name", "TestMac.txt", metadatas.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY)); - assertContains("mac embedded", metadatas.get(2).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("mac embedded", metadatas.get(2).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); assertEquals("file name", "TestDos.txt", metadatas.get(3).get(TikaCoreProperties.RESOURCE_NAME_KEY)); - assertContains("dos embedded", metadatas.get(3).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("dos embedded", metadatas.get(3).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); assertEquals("file name", "TestUnix.txt", metadatas.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY)); - assertContains("unix embedded", metadatas.get(4).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("unix embedded", metadatas.get(4).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java index 9a1d579..47919e9 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java @@ -40,6 +40,7 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.junit.BeforeClass; import org.junit.Test; @@ -83,7 +84,7 @@ public class CompressorParserTest extends TikaTest { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testBROTLI_compressed.br"); List<Metadata> metadataList = 
getRecursiveMetadata("testBROTLI_compressed.br", metadata); - assertContains("XXXXXXXXXXYYYYYYYYYY", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("XXXXXXXXXXYYYYYYYYYY", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); assertEquals("testBROTLI_compressed", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java index 9fc0e81..ae7eb96 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java @@ -38,6 +38,7 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.BodyContentHandler; import org.junit.Test; import org.xml.sax.ContentHandler; @@ -212,7 +213,7 @@ public class ZipParserTest extends AbstractPkgTest { assertContains("EncryptedDocumentException: stream (encrypted.txt) is encrypted", values[0]); - assertContains("hello world", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertContains("hello world", metadataList.get(1).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } @Test diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java index 20ad4f9..05113dd 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java @@ -49,8 +49,10 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import 
org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.sax.WriteOutContentHandler; import org.junit.Test; import org.xml.sax.ContentHandler; @@ -441,7 +443,7 @@ public class RTFParserTest extends TikaTest { //directory: _1457338524/HW.txt assertEquals("filename equals ", p.fileName, FilenameUtils.getName( - metadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH))); + metadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH))); assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE)); } @@ -454,15 +456,15 @@ public class RTFParserTest extends TikaTest { public void testRegularImages() throws Exception { Parser base = new AutoDetectParser(); ParseContext ctx = new ParseContext(); - RecursiveParserWrapper parser = new RecursiveParserWrapper(base, - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); - ContentHandler handler = new BodyContentHandler(); + RecursiveParserWrapper parser = new RecursiveParserWrapper(base); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1),-1); Metadata rootMetadata = new Metadata(); rootMetadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf"); try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) { parser.parse(tis, handler, rootMetadata, ctx); } - List<Metadata> metadatas = parser.getMetadata(); + List<Metadata> metadatas = handler.getMetadataList(); Metadata meta_jpg_exif = metadatas.get(1);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg"); Metadata meta_jpg = metadatas.get(3);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg"); diff --git 
a/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/PrettyMetadataKeyComparator.java b/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/PrettyMetadataKeyComparator.java index 9a18a8a..5516c1d 100644 --- a/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/PrettyMetadataKeyComparator.java +++ b/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/PrettyMetadataKeyComparator.java @@ -26,7 +26,7 @@ public class PrettyMetadataKeyComparator implements java.util.Comparator<String> return -1; } - //this is stinky. This should reference RecursiveParserWrapper.TIKA_CONTENT + //this is stinky. This should reference AbstractRecursiveParserWrapperHandler.TIKA_CONTENT //but that would require making core a dependency of serialization... //do we want to do that? if (s1.equals("tika:content")) { diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java index 739794c..0658fc4 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java @@ -37,6 +37,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.server.MetadataList; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -128,20 +129,28 @@ public class RecursiveMetadataResource { final ParseContext context = new ParseContext(); Parser parser = TikaResource.createParser(); // TODO: parameterize choice of max chars/max embedded attachments - BasicContentHandlerFactory.HANDLER_TYPE type = - BasicContentHandlerFactory.parseHandlerType(handlerTypeName, 
DEFAULT_HANDLER_TYPE); - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, - new BasicContentHandlerFactory(type, -1)); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); + + TikaResource.fillMetadata(parser, metadata, context, httpHeaders); // no need to add parser to parse recursively TikaResource.fillParseContext(context, httpHeaders, null); TikaResource.logRequest(LOG, info, metadata); - TikaResource.parse(wrapper, LOG, info.getPath(), is, - new LanguageHandler() { + + BasicContentHandlerFactory.HANDLER_TYPE type = + BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(type, -1), -1); + TikaResource.parse(wrapper, LOG, info.getPath(), is, handler, metadata, context); + /* + We used to have this non-functional bit of code...refactor to add it back and make it work? + new LanguageHandler() { public void endDocument() { metadata.set("language", getLanguage().getLanguage()); } - }, metadata, context); - return new MetadataList(wrapper.getMetadata()); + }, + */ + return new MetadataList(handler.getMetadataList()); } + } diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java index 783d622..c50f989 100644 --- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java +++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java @@ -40,6 +40,7 @@ import org.apache.tika.metadata.OfficeOpenXMLExtended; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.serialization.JsonMetadataList; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.server.resource.RecursiveMetadataResource; import 
org.apache.tika.server.writer.MetadataListMessageBodyWriter; import org.junit.Test; @@ -130,7 +131,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); List<Metadata> metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); - String content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim(); + String content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim(); assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">")); //extra slash @@ -142,7 +143,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); - content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim(); + content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim(); assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">")); //unparseable @@ -154,7 +155,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); - content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim(); + content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim(); assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">")); //xml @@ -166,7 +167,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); - content = 
metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim(); + content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim(); assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">")); //text @@ -178,7 +179,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); - content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim(); + content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim(); assertTrue(content.startsWith("embed_3")); //ignore @@ -190,7 +191,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); - assertNull(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertNull(metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } @@ -209,7 +210,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); List<Metadata> metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); - String content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim(); + String content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim(); assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">")); //unparseable @@ -225,7 +226,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); - content = 
metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim(); + content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim(); assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">")); //xml @@ -241,7 +242,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); - content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim(); + content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim(); assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">")); //text @@ -257,7 +258,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); - content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim(); + content = metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim(); assertTrue(content.startsWith("embed_3")); //ignore -- no content @@ -273,7 +274,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); - assertNull(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertNull(metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT)); } } -- To stop receiving notification emails like this one, please contact talli...@apache.org.