Author: tallison Date: Fri Sep 19 19:18:08 2014 New Revision: 1626300 URL: http://svn.apache.org/r1626300 Log: TIKA-1329 add RecursiveParserWrapper
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.docx (with props) Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java?rev=1626300&view=auto ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java (added) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java Fri Sep 19 19:18:08 2014 @@ -0,0 +1,326 @@ +package org.apache.tika.parser; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.FilenameUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaMetadataKeys; +import org.apache.tika.mime.MediaType; +import org.apache.tika.sax.ContentHandlerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Date; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +/** + * This is a helper class that wraps a parser in a recursive handler. + * It takes care of setting the embedded parser in the ParseContext + * and handling the embedded path calculations. + * <p> + * After parsing a document, call getMetadata() to retrieve a list of + * Metadata objects, one for each embedded resource. The first item + * in the list will contain the Metadata for the outer container file. + * <p> + * Content can also be extracted and stored in the {@link #TIKA_CONTENT} field + * of a Metadata object. Select the type of content to be stored + * at initialization. + * <p> + * If a WriteLimitReachedException is encountered, the wrapper will stop + * processing the current resource, and it will not process + * any of the child resources for the given resource. However, it will try to + * parse as much as it can. If a WLRE is reached in the parent document, + * no child resources will be parsed. + * <p> + * The implementation is based on Jukka's RecursiveMetadataParser + * and Nick's additions. See: + * <a href="http://wiki.apache.org/tika/RecursiveMetadata#Jukka.27s_RecursiveMetadata_Parser">RecursiveMetadataParser</a>. + * <p> + * Note that this wrapper holds all data in memory and is not appropriate + * for files with content too large to be held in memory. + * <p> + * Note, too, that this wrapper is not thread safe because it stores state. + * The client must initialize a new wrapper for each thread, and the client + * is responsible for calling {@link #reset()} after each parse. + * <p> + * The unit tests for this class are in the tika-parsers module. + * </p> + */ +public class RecursiveParserWrapper implements Parser { + + /** + * Generated serial version + */ + private static final long serialVersionUID = 9086536568120690938L; + + + + public final static String TIKA_PREFIX = "tika:"; + public final static String TIKA_EXCEPTION_PREFIX = "tika_ex:"; + + //move this to TikaCoreProperties? + public final static Property TIKA_CONTENT = Property.internalText(TIKA_PREFIX+"content"); + public final static Property PARSE_TIME_MILLIS = Property.internalText(TIKA_PREFIX+"parse_time_millis"); + public final static Property WRITE_LIMIT_REACHED = + Property.internalBoolean(TIKA_EXCEPTION_PREFIX+"write_limit_reached"); + public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = + Property.internalBoolean(TIKA_EXCEPTION_PREFIX+"embedded_resource_limit_reached"); + + public final static Property PARSE_EXCEPTION = + Property.internalBoolean(TIKA_EXCEPTION_PREFIX+"parse_exception"); + + //move this to TikaCoreProperties? + public final static Property EMBEDDED_RESOURCE_PATH = + Property.internalText(TIKA_PREFIX+"embedded_resource_path"); + + private final Parser wrappedParser; + private final ContentHandlerFactory contentHandlerFactory; + private final List<Metadata> metadatas = new LinkedList<Metadata>(); + + //used in naming embedded resources that don't have a name. + private int unknownCount = 0; + private int maxEmbeddedResources = -1; + private boolean hitMaxEmbeddedResources = false; + + public RecursiveParserWrapper(Parser wrappedParser, ContentHandlerFactory contentHandlerFactory) { + this.wrappedParser = wrappedParser; + this.contentHandlerFactory = contentHandlerFactory; + } + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return wrappedParser.getSupportedTypes(context); + } + + /** + * Acts like a regular parser except it ignores the ContentHandler + * and it automatically sets/overwrites the embedded Parser in the + * ParseContext object. + * <p> + * To retrieve the results of the parse, use {@link #getMetadata()}. + * <p> + * Make sure to call {@link #reset()} after each parse. + */ + @Override + public void parse(InputStream stream, ContentHandler ignore, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + + String name = getResourceName(metadata); + EmbeddedParserDecorator decorator = new EmbeddedParserDecorator(name); + context.set(Parser.class, decorator); + ContentHandler localHandler = contentHandlerFactory.getNewContentHandler(); + long started = new Date().getTime(); + try { + wrappedParser.parse(stream, localHandler, metadata, context); + } catch (SAXException e) { + boolean wlr = isWriteLimitReached(e); + if (wlr == false) { + throw e; + } + metadata.set(WRITE_LIMIT_REACHED, "true"); + } + long elapsedMillis = new Date().getTime()-started; + metadata.set(PARSE_TIME_MILLIS, Long.toString(elapsedMillis)); + addContent(localHandler, metadata); + + if (hitMaxEmbeddedResources) { + metadata.set(EMBEDDED_RESOURCE_LIMIT_REACHED, "true"); + } + metadatas.add(0, deepCopy(metadata)); + } + + /** + * + * The first element in the returned list represents the + * data from the outer container file. There is no guarantee + * about the ordering of the list after that. + * + * @return list of Metadata objects that were gathered during the parse + */ + public List<Metadata> getMetadata() { + return metadatas; + } + + /** + * Set the maximum number of embedded resources to store. + * If the max is hit during parsing, the {@link #EMBEDDED_RESOURCE_LIMIT_REACHED} + * property will be added to the container document's Metadata. + * + * <p> + * If this value is < 0 (the default), the wrapper will store all Metadata. + * + * @param max maximum number of embedded resources to store + */ + public void setMaxEmbeddedResources(int max) { + maxEmbeddedResources = max; + } + + + /** + * This clears the metadata list and resets {@link #unknownCount} and + * {@link #hitMaxEmbeddedResources} + */ + public void reset() { + metadatas.clear(); + unknownCount = 0; + hitMaxEmbeddedResources = false; + } + + /** + * Copied/modified from WriteOutContentHandler. Couldn't make that + * static, and we need to have something that will work + * with exceptions thrown from both BodyContentHandler and WriteOutContentHandler + * @param t + * @return + */ + private boolean isWriteLimitReached(Throwable t) { + if (t.getMessage().indexOf("Your document contained more than") == 0) { + return true; + } else { + return t.getCause() != null && isWriteLimitReached(t.getCause()); + } + } + + //defensive copy + private Metadata deepCopy(Metadata m) { + Metadata clone = new Metadata(); + + for (String n : m.names()){ + if (! m.isMultiValued(n)) { + clone.set(n, m.get(n)); + } else { + String[] vals = m.getValues(n); + for (int i = 0; i < vals.length; i++) { + clone.add(n, vals[i]); + } + } + } + return clone; + } + + private String getResourceName(Metadata metadata) { + String objectName = ""; + if (metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY) != null) { + objectName = metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY); + } else if (metadata.get(TikaMetadataKeys.EMBEDDED_RELATIONSHIP_ID) != null) { + objectName = metadata.get(TikaMetadataKeys.EMBEDDED_RELATIONSHIP_ID); + } else { + objectName = "embedded-" + (++unknownCount); + } + //make sure that there isn't any path info in the objectName + //some parsers can return paths, not just file names + objectName = FilenameUtils.getName(objectName); + return objectName; + } + + private void addContent(ContentHandler handler, Metadata metadata) { + + if (handler.getClass().equals(DefaultHandler.class)){ + //no-op: we can't rely on just testing for + //empty content because DefaultHandler's toString() + //returns e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd" + } else { + String content = handler.toString(); + if (content != null && content.trim().length() > 0 ) { + metadata.add(TIKA_CONTENT, content); + } + } + + } + + /** + * Override for different behavior. + * + * @return handler to be used for each document + */ + + + private class EmbeddedParserDecorator extends ParserDecorator { + + private static final long serialVersionUID = 207648200464263337L; + + private String location = null; + + + private EmbeddedParserDecorator(String location) { + super(wrappedParser); + this.location = location; + if (! this.location.endsWith("/")) { + this.location += "/"; + } + } + + @Override + public void parse(InputStream stream, ContentHandler ignore, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + //Test to see if we should avoid parsing + if (maxEmbeddedResources > -1 && + metadatas.size() >= maxEmbeddedResources) { + hitMaxEmbeddedResources = true; + return; + } + // Work out what this thing is + String objectName = getResourceName(metadata); + String objectLocation = this.location + objectName; + + metadata.add(EMBEDDED_RESOURCE_PATH, objectLocation); + + //ignore the content handler that is passed in + //and get a fresh handler + ContentHandler localHandler = contentHandlerFactory.getNewContentHandler(); + + Parser preContextParser = context.get(Parser.class); + context.set(Parser.class, new EmbeddedParserDecorator(objectLocation)); + + try { + super.parse(stream, localHandler, metadata, context); + } catch (SAXException e) { + boolean wlr = isWriteLimitReached(e); + if (wlr == true) { + metadata.add(WRITE_LIMIT_REACHED, "true"); + } else { + throw e; + } + } finally { + context.set(Parser.class, preContextParser); + } + + //Because of recursion, we need + //to re-test to make sure that we limit the + //number of stored resources + if (maxEmbeddedResources > -1 && + metadatas.size() >= maxEmbeddedResources) { + hitMaxEmbeddedResources = true; + return; + } + addContent(localHandler, metadata); + metadatas.add(deepCopy(metadata)); + } + } + + +} Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java?rev=1626300&view=auto ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java (added) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java Fri Sep 19 19:18:08 2014 @@ -0,0 +1,126 @@ +package org.apache.tika.sax; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.xml.sax.ContentHandler; +import org.xml.sax.helpers.DefaultHandler; + +import java.io.OutputStream; +import java.io.UnsupportedEncodingException; + +/** + * Basic factory for creating common types of ContentHandlers + */ +public class BasicContentHandlerFactory implements ContentHandlerFactory { + + /** + * Common handler types for content. + */ + public enum HANDLER_TYPE { + BODY, + IGNORE, //don't store content + TEXT, + HTML, + XML + }; + + private final HANDLER_TYPE type; + private final int writeLimit; + + /** + * + * @param type basic type of handler + * @param writeLimit max number of characters to store; if < 0, the handler will store all characters + */ + public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit) { + this.type = type; + this.writeLimit = writeLimit; + } + + @Override + public ContentHandler getNewContentHandler() { + + if (writeLimit > -1) { + switch(type) { + case BODY: + return new BodyContentHandler(writeLimit); + case IGNORE: + return new DefaultHandler(); + case TEXT: + return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit); + case HTML: + return new WriteOutContentHandler(new ToHTMLContentHandler(), writeLimit); + case XML: + return new WriteOutContentHandler(new ToXMLContentHandler(), writeLimit); + default: + return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit); + } + } else { + switch (type) { + case BODY: + return new BodyContentHandler(); + case IGNORE: + return new DefaultHandler(); + case TEXT: + return new ToTextContentHandler(); + case HTML: + return new ToHTMLContentHandler(); + case XML: + return new ToXMLContentHandler(); + default: + return new ToTextContentHandler(); + + } + } + } + + @Override + public ContentHandler getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException { + if (writeLimit > -1) { + switch(type) { + case BODY: + return new WriteOutContentHandler(new BodyContentHandler(new ToTextContentHandler(os, encoding)), writeLimit); + case IGNORE: + return new DefaultHandler(); + case TEXT: + return new WriteOutContentHandler(new ToTextContentHandler(os, encoding), writeLimit); + case HTML: + return new WriteOutContentHandler(new ToHTMLContentHandler(os, encoding), writeLimit); + case XML: + return new WriteOutContentHandler(new ToXMLContentHandler(os, encoding), writeLimit); + default: + return new WriteOutContentHandler(new ToTextContentHandler(os, encoding), writeLimit); + } + } else { + switch (type) { + case BODY: + return new BodyContentHandler(new ToTextContentHandler(os, encoding)); + case IGNORE: + return new DefaultHandler(); + case TEXT: + return new ToTextContentHandler(os, encoding); + case HTML: + return new ToHTMLContentHandler(os, encoding); + case XML: + return new ToXMLContentHandler(os, encoding); + default: + return new ToTextContentHandler(os, encoding); + + } + } + } + +} Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java?rev=1626300&view=auto ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java (added) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java Fri Sep 19 19:18:08 2014 @@ -0,0 +1,32 @@ +package org.apache.tika.sax; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.xml.sax.ContentHandler; + +import java.io.OutputStream; +import java.io.UnsupportedEncodingException; + +/** + * Interface to allow easier injection of code for getting a new ContentHandler + */ +public interface ContentHandlerFactory { + public ContentHandler getNewContentHandler(); + public ContentHandler getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException; + +} Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1626300&r1=1626299&r2=1626300&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Fri Sep 19 19:18:08 2014 @@ -16,9 +16,17 @@ */ package org.apache.tika; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; +import org.apache.tika.extractor.EmbeddedResourceHandler; +import org.apache.tika.io.IOUtils; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ToXMLContentHandler; +import org.xml.sax.ContentHandler; import java.io.ByteArrayOutputStream; import java.io.File; @@ -31,21 +39,9 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.EmbeddedResourceHandler; -import org.apache.tika.io.IOUtils; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.ParserDecorator; -import org.apache.tika.sax.BodyContentHandler; -import org.apache.tika.sax.ToXMLContentHandler; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; /** * Parent class of Tika tests @@ -200,56 +196,4 @@ public abstract class TikaTest { } } } - - /** - * Stores metadata and (optionally) content. - * Many thanks to Jukka's example: - * http://wiki.apache.org/tika/RecursiveMetadata - * This ignores the incoming handler and applies a - * new BodyContentHandler(-1) for each file. - */ - public static class RecursiveMetadataParser extends ParserDecorator { - /** Key for content string if stored */ - public static final String TIKA_CONTENT = "tika:content"; - - private static final long serialVersionUID = 1L; - - private List<Metadata> metadatas = new ArrayList<Metadata>(); - private final boolean storeContent; - - public RecursiveMetadataParser(Parser parser, - boolean storeContent) { - super(parser); - this.storeContent = storeContent; - } - - @Override - public void parse( - InputStream stream, ContentHandler ignoredHandler, - Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException { - - ContentHandler contentHandler = null; - if (storeContent) { - contentHandler = new BodyContentHandler(-1); - } else { - contentHandler = new DefaultHandler(); - } - super.parse(stream, contentHandler, metadata, context); - - if (storeContent) { - metadata.add(TIKA_CONTENT, contentHandler.toString()); - } - metadatas.add(metadata); - } - - public List<Metadata> getAllMetadata() { - return metadatas; - } - - public void clear() { - metadatas.clear(); - } - } - } Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java?rev=1626300&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java (added) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java Fri Sep 19 19:18:08 2014 @@ -0,0 +1,202 @@ +package org.apache.tika.parser; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.ContentHandlerFactory; +import org.junit.Test; +import org.xml.sax.helpers.DefaultHandler; + +import java.io.InputStream; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +public class RecursiveParserWrapperTest { + + @Test + public void testBasicXML() throws Exception { + List<Metadata> list = getMetadata(new Metadata(), + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + Metadata container = list.get(0); + String content = container.get(RecursiveParserWrapper.TIKA_CONTENT); + //not much differentiates html from xml in this test file + assertTrue(content.indexOf("<p class=\"header\" />") > -1); + } + + @Test + public void testBasicHTML() throws Exception { + List<Metadata> list = getMetadata(new Metadata(), + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1)); + Metadata container = list.get(0); + String content = container.get(RecursiveParserWrapper.TIKA_CONTENT); + //not much differentiates html from xml in this test file + assertTrue(content.indexOf("<p class=\"header\"></p>") > -1); + } + + @Test + public void testBasicText() throws Exception { + List<Metadata> list = getMetadata(new Metadata(), + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); + Metadata container = list.get(0); + String content = container.get(RecursiveParserWrapper.TIKA_CONTENT); + assertTrue(content.indexOf("<p ") < 0); + assertTrue(content.indexOf("embed_0") > -1); + } + + @Test + public void testIgnoreContent() throws Exception { + List<Metadata> list = getMetadata(new Metadata(), + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); + Metadata container = list.get(0); + String content = container.get(RecursiveParserWrapper.TIKA_CONTENT); + assertNull(content); + } + + + @Test + public void testCharLimit() throws Exception { + ParseContext context = new ParseContext(); + Metadata metadata = new Metadata(); + + Parser wrapped = new AutoDetectParser(); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60)); + InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream( + "/test-documents/test_recursive_embedded.docx"); + wrapper.parse(stream, new DefaultHandler(), metadata, context); + List<Metadata> list = wrapper.getMetadata(); + + assertEquals(5, list.size()); + + int wlr = 0; + for (Metadata m : list) { + String limitReached = m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED); + if (limitReached != null && limitReached.equals("true")){ + wlr++; + } + } + assertEquals(1, wlr); + + } + @Test + public void testMaxEmbedded() throws Exception { + int maxEmbedded = 4; + int totalNoLimit = 12;//including outer container file + ParseContext context = new ParseContext(); + Metadata metadata = new Metadata(); + String limitReached = null; + + Parser wrapped = new AutoDetectParser(); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); + + InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream( + "/test-documents/test_recursive_embedded.docx"); + wrapper.parse(stream, new DefaultHandler(), metadata, context); + List<Metadata> list = wrapper.getMetadata(); + //test default + assertEquals(totalNoLimit, list.size()); + + limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED); + assertNull(limitReached); + + + wrapper.reset(); + stream.close(); + + //test setting value + metadata = new Metadata(); + stream = RecursiveParserWrapperTest.class.getResourceAsStream( + "/test-documents/test_recursive_embedded.docx"); + wrapper.setMaxEmbeddedResources(maxEmbedded); + wrapper.parse(stream, new DefaultHandler(), metadata, context); + list = wrapper.getMetadata(); + + //add 1 for outer container file + assertEquals(maxEmbedded+1, list.size()); + + limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED); + assertEquals("true", limitReached); + + wrapper.reset(); + stream.close(); + + //test setting value < 0 + metadata = new Metadata(); + stream = RecursiveParserWrapperTest.class.getResourceAsStream( + "/test-documents/test_recursive_embedded.docx"); + + wrapper.setMaxEmbeddedResources(-2); + wrapper.parse(stream, new DefaultHandler(), metadata, context); + assertEquals(totalNoLimit, list.size()); + limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED); + assertNull(limitReached); + } + + @Test + public void testEmbeddedResourcePath() throws Exception { + + Set<String> targets = new HashSet<String>(); + targets.add("test_recursive_embedded.docx/embed1.zip"); + targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip"); + targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed3.zip"); + targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed3.zip/embed4.zip"); + targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed3.zip/embed4.zip/embed4.txt"); + targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed3.zip/embed3.txt"); + targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed2a.txt"); + targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed2b.txt"); + targets.add("test_recursive_embedded.docx/embed1.zip/embed1b.txt"); + targets.add("test_recursive_embedded.docx/embed1.zip/embed1a.txt"); + targets.add("test_recursive_embedded.docx/image1.emf"); + + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx"); + List<Metadata> list = getMetadata(metadata, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + Metadata container = list.get(0); + String content = container.get(RecursiveParserWrapper.TIKA_CONTENT); + assertTrue(content.indexOf("<p class=\"header\" />") > -1); + + Set<String> seen = new HashSet<String>(); + for (Metadata m : list) { + String path = m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH); + if (path != null) { + seen.add(path); + } + } + assertEquals(targets, seen); + } + + private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory) + throws Exception{ + ParseContext context = new ParseContext(); + Parser wrapped = new AutoDetectParser(); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, contentHandlerFactory); + InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream( + "/test-documents/test_recursive_embedded.docx"); + wrapper.parse(stream, new DefaultHandler(), metadata, context); + return wrapper.getMetadata(); + } +} Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1626300&r1=1626299&r2=1626300&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Sep 19 19:18:08 2014 @@ -16,21 +16,6 @@ */ package org.apache.tika.parser.pdf; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; - import org.apache.tika.TikaTest; import org.apache.tika.extractor.ContainerExtractor; import org.apache.tika.extractor.DocumentSelector; @@ -44,10 +29,27 @@ import org.apache.tika.parser.AutoDetect import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; import org.junit.Test; import org.xml.sax.ContentHandler; + +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; /** * Test case for parsing pdf files. */ @@ -667,7 +669,8 @@ public class PDFParserTest extends TikaT //"regressiveness" exists only in Unit10.doc not in the container pdf document assertTrue(xml.contains("regressiveness")); - RecursiveMetadataParser p = new RecursiveMetadataParser(new AutoDetectParser(), false); + RecursiveParserWrapper p = new RecursiveParserWrapper(new AutoDetectParser(), + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); TikaInputStream tis = null; ParseContext context = new ParseContext(); PDFParserConfig config = new PDFParserConfig(); @@ -686,16 +689,17 @@ public class PDFParserTest extends TikaT } } - List<Metadata> metadatas = p.getAllMetadata(); + List<Metadata> metadatas = p.getMetadata(); + assertEquals(5, metadatas.size()); assertNull(metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY)); assertNull(metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY)); - assertEquals("Press Quality(1).joboptions", metadatas.get(2).get(Metadata.RESOURCE_NAME_KEY)); - assertEquals("Unit10.doc", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY)); - assertEquals(MediaType.image("jpeg").toString(), metadatas.get(0).get(Metadata.CONTENT_TYPE)); - assertEquals(MediaType.image("tiff").toString(), metadatas.get(1).get(Metadata.CONTENT_TYPE)); - assertEquals("text/plain; charset=ISO-8859-1", metadatas.get(2).get(Metadata.CONTENT_TYPE)); - assertEquals(TYPE_DOC.toString(), metadatas.get(3).get(Metadata.CONTENT_TYPE)); + assertEquals("Press Quality(1).joboptions", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY)); + assertEquals("Unit10.doc", metadatas.get(4).get(Metadata.RESOURCE_NAME_KEY)); + assertEquals(MediaType.image("jpeg").toString(), metadatas.get(1).get(Metadata.CONTENT_TYPE)); + assertEquals(MediaType.image("tiff").toString(), metadatas.get(2).get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; charset=ISO-8859-1", metadatas.get(3).get(Metadata.CONTENT_TYPE)); + assertEquals(TYPE_DOC.toString(), metadatas.get(4).get(Metadata.CONTENT_TYPE)); } @@ -849,7 +853,8 @@ public class PDFParserTest extends TikaT Parser defaultParser = new AutoDetectParser(); - RecursiveMetadataParser p = new RecursiveMetadataParser(defaultParser, false); + RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); ParseContext context = new ParseContext(); context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config); context.set(org.apache.tika.parser.Parser.class, p); @@ -860,7 +865,7 @@ public class PDFParserTest extends TikaT p.parse(stream, handler, metadata, context); - List<Metadata> metadatas = p.getAllMetadata(); + List<Metadata> metadatas = p.getMetadata(); int inline = 0; int attach = 0; for (Metadata m : metadatas) { @@ -877,7 +882,7 @@ public class PDFParserTest extends TikaT assertEquals(2, attach); stream.close(); - p.clear(); + p.reset(); //now try turning off inline stream = TikaInputStream.get(this.getClass().getResource(path)); @@ -889,7 +894,7 @@ public class PDFParserTest extends TikaT metadata = new Metadata(); p.parse(stream, handler, metadata, context); - metadatas = p.getAllMetadata(); + metadatas = p.getMetadata(); for (Metadata m : metadatas) { String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); if (v != null) { @@ -910,7 +915,8 @@ public class PDFParserTest extends TikaT public void testInlineConfig() throws Exception { Parser defaultParser = new AutoDetectParser(); - RecursiveMetadataParser p = new RecursiveMetadataParser(defaultParser, false); + RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); ParseContext context = new ParseContext(); context.set(org.apache.tika.parser.Parser.class, p); Metadata metadata = new Metadata(); @@ -920,7 +926,7 @@ public class PDFParserTest extends TikaT p.parse(stream, handler, metadata, context); - List<Metadata> metadatas = p.getAllMetadata(); + List<Metadata> metadatas = p.getMetadata(); int inline = 0; int attach = 0; for (Metadata m : metadatas) { @@ -937,7 +943,7 @@ public class PDFParserTest extends TikaT assertEquals(2, attach); stream.close(); - p.clear(); + p.reset(); //now try turning off inline stream = TikaInputStream.get(this.getClass().getResource(path)); @@ -952,7 +958,7 @@ public class PDFParserTest extends TikaT metadata = new Metadata(); p.parse(stream, handler, metadata, context); - metadatas = p.getAllMetadata(); + metadatas = p.getMetadata(); for (Metadata m : metadatas) { String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); if (v != null) { @@ -971,16 +977,18 @@ public class PDFParserTest extends TikaT public void testEmbeddedFileNameExtraction() throws Exception { InputStream is = PDFParserTest.class.getResourceAsStream( "/test-documents/testPDF_multiFormatEmbFiles.pdf"); - RecursiveMetadataParser p = new RecursiveMetadataParser(new AutoDetectParser(), false); + RecursiveParserWrapper p = new RecursiveParserWrapper( + new AutoDetectParser(), + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); Metadata m = new Metadata(); ParseContext c = new ParseContext(); c.set(org.apache.tika.parser.Parser.class, p); ContentHandler h = new BodyContentHandler(); p.parse(is, h, m, c); is.close(); - List<Metadata> metadatas = p.getAllMetadata(); + List<Metadata> metadatas = p.getMetadata(); assertEquals("metadata size", 5, metadatas.size()); - Metadata firstAttachment = metadatas.get(0); + Metadata firstAttachment = metadatas.get(1); assertEquals("attachment file name", "Test.txt", firstAttachment.get(Metadata.RESOURCE_NAME_KEY)); } @@ -988,24 +996,26 @@ public class PDFParserTest extends TikaT public void testOSSpecificEmbeddedFileExtraction() throws Exception { InputStream is = PDFParserTest.class.getResourceAsStream( "/test-documents/testPDF_multiFormatEmbFiles.pdf"); - RecursiveMetadataParser p = new RecursiveMetadataParser(new AutoDetectParser(), true); + RecursiveParserWrapper p = new RecursiveParserWrapper( + new AutoDetectParser(), + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); Metadata m = new Metadata(); ParseContext c = new ParseContext(); c.set(org.apache.tika.parser.Parser.class, p); ContentHandler h = new BodyContentHandler(); p.parse(is, h, m, c); is.close(); - List<Metadata> metadatas = p.getAllMetadata(); + List<Metadata> metadatas = p.getMetadata(); assertEquals("metadata size", 5, metadatas.size()); - assertEquals("file name", "Test.txt", metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY)); - assertContains("os specific", metadatas.get(0).get(RecursiveMetadataParser.TIKA_CONTENT)); - assertEquals("file name", "TestMac.txt", metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY)); - assertContains("mac embedded", metadatas.get(1).get(RecursiveMetadataParser.TIKA_CONTENT)); - assertEquals("file name", "TestDos.txt", metadatas.get(2).get(Metadata.RESOURCE_NAME_KEY)); - assertContains("dos embedded", metadatas.get(2).get(RecursiveMetadataParser.TIKA_CONTENT)); - assertEquals("file name", "TestUnix.txt", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY)); - assertContains("unix embedded", metadatas.get(3).get(RecursiveMetadataParser.TIKA_CONTENT)); + assertEquals("file name", "Test.txt", metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY)); + assertContains("os specific", metadatas.get(1).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertEquals("file name", "TestMac.txt", metadatas.get(2).get(Metadata.RESOURCE_NAME_KEY)); + assertContains("mac embedded", metadatas.get(2).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertEquals("file name", "TestDos.txt", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY)); + assertContains("dos embedded", metadatas.get(3).get(RecursiveParserWrapper.TIKA_CONTENT)); + assertEquals("file name", "TestUnix.txt", metadatas.get(4).get(Metadata.RESOURCE_NAME_KEY)); + assertContains("unix embedded", metadatas.get(4).get(RecursiveParserWrapper.TIKA_CONTENT)); } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1626300&r1=1626299&r2=1626300&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Fri Sep 19 19:18:08 2014 @@ -16,23 +16,6 @@ */ package org.apache.tika.parser.rtf; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertNotNull; - -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - - import org.apache.tika.Tika; import org.apache.tika.TikaTest; import org.apache.tika.extractor.ContainerExtractor; @@ -48,11 +31,29 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.WriteOutContentHandler; import org.junit.Test; import org.xml.sax.ContentHandler; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + /** * Junit test class for the Tika {@link RTFParser} */ @@ -516,7 +517,8 @@ public class RTFParserTest extends TikaT public void testRegularImages() throws Exception { Parser base = new AutoDetectParser(); ParseContext ctx = new ParseContext(); - RecursiveMetadataParser parser = new RecursiveMetadataParser(base, false); + RecursiveParserWrapper parser = new RecursiveParserWrapper(base, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); ctx.set(org.apache.tika.parser.Parser.class, parser); TikaInputStream tis = null; ContentHandler handler = new BodyContentHandler(); @@ -528,7 +530,7 @@ public class RTFParserTest extends TikaT } finally { tis.close(); } - List<Metadata> metadatas = parser.getAllMetadata(); + List<Metadata> metadatas = parser.getMetadata(); Metadata meta_jpg_exif = metadatas.get(0);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg"); Metadata meta_jpg = metadatas.get(2);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg"); Added: tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.docx URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.docx?rev=1626300&view=auto ============================================================================== Binary file - no diff available. Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.docx ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream