Author: tallison Date: Mon Oct 27 17:00:03 2014 New Revision: 1634594 URL: http://svn.apache.org/r1634594 Log: TIKA-1459 fix write limit bug in BasicContentHandlerFactory when creating a BodyContentHandler
Added: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1634594&r1=1634593&r2=1634594&view=diff ============================================================================== --- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java (original) +++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Mon Oct 27 17:00:03 2014 @@ -121,7 +121,7 @@ public class TikaGUI extends JFrame } //maximum length to allow for mark for reparse to get JSON - private final int MAX_MARK = 20971520;//20MB + private final int MAX_MARK = 20*1024*1024;//20MB /** * Parsing context. */ @@ -379,7 +379,8 @@ public class TikaGUI extends JFrame } if (isReset) { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1)); + new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1)); wrapper.parse(input, null, new Metadata(), new ParseContext()); StringWriter jsonBuffer = new StringWriter(); JsonMetadataList.setPrettyPrinting(true); Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java?rev=1634594&r1=1634593&r2=1634594&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java Mon Oct 27 17:00:03 2014 @@ -15,12 +15,14 @@ package org.apache.tika.sax; * See the License for the specific language governing permissions and * limitations under the License. */ -import org.xml.sax.ContentHandler; -import org.xml.sax.helpers.DefaultHandler; import java.io.OutputStream; +import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; +import org.xml.sax.ContentHandler; +import org.xml.sax.helpers.DefaultHandler; + /** * Basic factory for creating common types of ContentHandlers */ @@ -53,12 +55,13 @@ public class BasicContentHandlerFactory @Override public ContentHandler getNewContentHandler() { + if (type == HANDLER_TYPE.BODY) { + return new BodyContentHandler(writeLimit); + } else if (type == HANDLER_TYPE.IGNORE) { + return new DefaultHandler(); + } if (writeLimit > -1) { switch(type) { - case BODY: - return new BodyContentHandler(writeLimit); - case IGNORE: - return new DefaultHandler(); case TEXT: return new WriteOutContentHandler(new ToTextContentHandler(), writeLimit); case HTML: @@ -70,10 +73,6 @@ public class BasicContentHandlerFactory } } else { switch (type) { - case BODY: - return new BodyContentHandler(); - case IGNORE: - return new DefaultHandler(); case TEXT: return new ToTextContentHandler(); case HTML: @@ -89,12 +88,17 @@ public class BasicContentHandlerFactory @Override public ContentHandler getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException { + + if (type == HANDLER_TYPE.IGNORE) { + return new DefaultHandler(); + } + if (writeLimit > -1) { switch(type) { case BODY: - return new WriteOutContentHandler(new BodyContentHandler(new ToTextContentHandler(os, encoding)), writeLimit); - case IGNORE: - return new DefaultHandler(); + return new WriteOutContentHandler( + new BodyContentHandler( + new OutputStreamWriter(os, encoding)), writeLimit); case TEXT: return new WriteOutContentHandler(new ToTextContentHandler(os, encoding), writeLimit); case HTML: @@ -107,9 +111,7 @@ public class BasicContentHandlerFactory } else { switch (type) { case BODY: - return new BodyContentHandler(new ToTextContentHandler(os, encoding)); - case IGNORE: - return new DefaultHandler(); + return new BodyContentHandler(new OutputStreamWriter(os, encoding)); case TEXT: return new ToTextContentHandler(os, encoding); case HTML: Added: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java?rev=1634594&view=auto ============================================================================== --- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java (added) +++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java Mon Oct 27 17:00:03 2014 @@ -0,0 +1,321 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.sax; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.util.Set; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.junit.Test; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; +import org.xml.sax.helpers.DefaultHandler; + +import static junit.framework.Assert.assertFalse; +import static junit.framework.Assert.assertTrue; +import static org.junit.Assert.assertEquals; + +/** + * Test cases for the {@link org.apache.tika.sax.BodyContentHandler} class. + */ +public class BasicContentHandlerFactoryTest { + private static final String ENCODING = "UTF-8"; + //default max char len (at least in WriteOutContentHandler is 100k) + private static final int OVER_DEFAULT = 120000; + + @Test + public void testIgnore() throws Exception { + Parser p = new MockParser(OVER_DEFAULT); + ContentHandler handler = + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1).getNewContentHandler(); + assertTrue(handler instanceof DefaultHandler); + p.parse(null, handler, null, null); + assertTrue(handler.toString().contains("")); + + //tests that no write limit exception is thrown + p = new MockParser(100); + handler = + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5).getNewContentHandler(); + assertTrue(handler instanceof DefaultHandler); + p.parse(null, handler, null, null); + assertTrue(handler.toString().contains("")); + } + + @Test + public void testText() throws Exception { + Parser p = new MockParser(OVER_DEFAULT); + BasicContentHandlerFactory.HANDLER_TYPE type = + BasicContentHandlerFactory.HANDLER_TYPE.TEXT; + ContentHandler handler = + new BasicContentHandlerFactory(type, -1).getNewContentHandler(); + + assertTrue(handler instanceof ToTextContentHandler); + p.parse(null, handler, null, null); + assertTrue(handler.toString().contains("This is the title")); + assertTrue(handler.toString().contains("aaaaaaaaaa")); + assertFalse(handler.toString().toLowerCase().contains("<body")); + assertFalse(handler.toString().toLowerCase().contains("<html")); + assertTrue(handler.toString().length() > 110000); + //now test write limit + p = new MockParser(10); + handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); + assertTrue(handler instanceof WriteOutContentHandler); + assertWriteLimitReached(p, (WriteOutContentHandler) handler); + assertTrue(handler.toString().contains("This ")); + assertFalse(handler.toString().toLowerCase().contains("aaaa")); + + //now test outputstream call + p = new MockParser(OVER_DEFAULT); + ByteArrayOutputStream os = new ByteArrayOutputStream(); + handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); + assertTrue(handler instanceof ToTextContentHandler); + p.parse(null, handler, null, null); + assertContains("This is the title", os.toByteArray()); + assertContains("aaaaaaaaaa", os.toByteArray()); + assertTrue(os.toByteArray().length > 110000); + assertNotContains("<body", os.toByteArray()); + assertNotContains("<html", os.toByteArray()); + + p = new MockParser(10); + os = new ByteArrayOutputStream(); + handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING); + assertTrue(handler instanceof WriteOutContentHandler); + assertWriteLimitReached(p, (WriteOutContentHandler) handler); + //When writing to an OutputStream and a write limit is reached, + //currently, nothing is written. + assertEquals(0, os.toByteArray().length); + } + + + @Test + public void testHTML() throws Exception { + Parser p = new MockParser(OVER_DEFAULT); + BasicContentHandlerFactory.HANDLER_TYPE type = + BasicContentHandlerFactory.HANDLER_TYPE.HTML; + ContentHandler handler = + new BasicContentHandlerFactory(type, -1).getNewContentHandler(); + + assertTrue(handler instanceof ToHTMLContentHandler); + p.parse(null, handler, null, null); + assertTrue(handler.toString().contains("<head><title>This is the title")); + assertTrue(handler.toString().contains("aaaaaaaaaa")); + assertTrue(handler.toString().length() > 110000); + + //now test write limit + p = new MockParser(10); + handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); + assertTrue(handler instanceof WriteOutContentHandler); + assertWriteLimitReached(p, (WriteOutContentHandler) handler); + assertTrue(handler.toString().contains("This ")); + assertFalse(handler.toString().toLowerCase().contains("aaaa")); + + //now test outputstream call + p = new MockParser(OVER_DEFAULT); + ByteArrayOutputStream os = new ByteArrayOutputStream(); + handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); + assertTrue(handler instanceof ToHTMLContentHandler); + p.parse(null, handler, null, null); + assertContains("This is the title", os.toByteArray()); + assertContains("aaaaaaaaaa", os.toByteArray()); + assertContains("<body", os.toByteArray()); + assertContains("<html", os.toByteArray()); + assertTrue(os.toByteArray().length > 110000); + + + p = new MockParser(10); + os = new ByteArrayOutputStream(); + handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING); + assertTrue(handler instanceof WriteOutContentHandler); + assertWriteLimitReached(p, (WriteOutContentHandler) handler); + assertEquals(0, os.toByteArray().length); + } + + @Test + public void testXML() throws Exception { + Parser p = new MockParser(OVER_DEFAULT); + BasicContentHandlerFactory.HANDLER_TYPE type = + BasicContentHandlerFactory.HANDLER_TYPE.HTML; + ContentHandler handler = + new BasicContentHandlerFactory(type, -1).getNewContentHandler(); + + assertTrue(handler instanceof ToXMLContentHandler); + p.parse(null, handler, new Metadata(), null); + assertTrue(handler.toString().contains("<head><title>This is the title")); + assertTrue(handler.toString().contains("aaaaaaaaaa")); + assertTrue(handler.toString().length() > 110000); + + //now test write limit + p = new MockParser(10); + handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); + assertTrue(handler instanceof WriteOutContentHandler); + assertWriteLimitReached(p, (WriteOutContentHandler) handler); + assertTrue(handler.toString().contains("This ")); + assertFalse(handler.toString().toLowerCase().contains("aaaa")); + + //now test outputstream call + p = new MockParser(OVER_DEFAULT); + ByteArrayOutputStream os = new ByteArrayOutputStream(); + handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); + assertTrue(handler instanceof ToXMLContentHandler); + p.parse(null, handler, null, null); + assertContains("This is the title", os.toByteArray()); + assertContains("aaaaaaaaaa", os.toByteArray()); + assertContains("<body", os.toByteArray()); + assertContains("<html", os.toByteArray()); + assertTrue(os.toByteArray().length > 110000); + + + p = new MockParser(10); + os = new ByteArrayOutputStream(); + handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING); + assertTrue(handler instanceof WriteOutContentHandler); + assertWriteLimitReached(p, (WriteOutContentHandler) handler); + assertEquals(0, os.toByteArray().length); + } + + + @Test + public void testBody() throws Exception { + Parser p = new MockParser(OVER_DEFAULT); + BasicContentHandlerFactory.HANDLER_TYPE type = + BasicContentHandlerFactory.HANDLER_TYPE.BODY; + ContentHandler handler = + new BasicContentHandlerFactory(type, -1).getNewContentHandler(); + + assertTrue(handler instanceof BodyContentHandler); + + p.parse(null, handler, null, null); + assertFalse(handler.toString().contains("title")); + assertTrue(handler.toString().contains("aaaaaaaaaa")); + assertTrue(handler.toString().length() > 110000); + + //now test write limit + p = new MockParser(10); + handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); + assertTrue(handler instanceof BodyContentHandler); + assertWriteLimitReached(p, (BodyContentHandler)handler); + assertFalse(handler.toString().contains("This ")); + assertTrue(handler.toString().toLowerCase().contains("aaaa")); + + //now test outputstream call + p = new MockParser(OVER_DEFAULT); + ByteArrayOutputStream os = new ByteArrayOutputStream(); + handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); + assertTrue(handler instanceof BodyContentHandler); + p.parse(null, handler, null, null); + assertNotContains("title", os.toByteArray()); + assertContains("aaaaaaaaaa", os.toByteArray()); + assertNotContains("<body", os.toByteArray()); + assertNotContains("<html", os.toByteArray()); + assertTrue(os.toByteArray().length > 110000); + + p = new MockParser(10); + os = new ByteArrayOutputStream(); + handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, ENCODING); + assertTrue(handler instanceof WriteOutContentHandler); + assertWriteLimitReached(p, (WriteOutContentHandler) handler); + assertEquals(0, os.toByteArray().length); + } + + private void assertWriteLimitReached(Parser p, WriteOutContentHandler handler) throws Exception { + boolean wlr = false; + try { + p.parse(null, handler, null, null); + } catch (SAXException e) { + if (! handler.isWriteLimitReached(e)) { + throw e; + } + wlr = true; + } + assertTrue("WriteLimitReached", wlr); + } + //TODO: is there a better way than to repeat this with diff signature? + private void assertWriteLimitReached(Parser p, BodyContentHandler handler) throws Exception { + boolean wlr = false; + try { + p.parse(null, handler, null, null); + } catch (SAXException e) { + if (! e.getClass().toString().contains("org.apache.tika.sax.WriteOutContentHandler$WriteLimitReachedException")){ + throw e; + } + + wlr = true; + } + assertTrue("WriteLimitReached", wlr); + } + + private void assertNotContains(String needle, byte[] hayStack) + throws UnsupportedEncodingException { + String s = new String(hayStack, ENCODING); + assertFalse(s.toLowerCase().contains(needle)); + } + + private void assertContains(String needle, byte[] hayStack) + throws UnsupportedEncodingException { + String s = new String(hayStack, ENCODING); + assertTrue(s.contains(needle)); + } + + //Simple mockparser that writes a title + //and charsToWrite number of 'a' + private class MockParser implements Parser { + private final String XHTML = "http://www.w3.org/1999/xhtml"; + private final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); + private final char[] TITLE = "This is the title".toCharArray(); + + private final int charsToWrite; + public MockParser(int charsToWrite) { + this.charsToWrite = charsToWrite; + } + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return null; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + handler.startDocument(); + handler.startPrefixMapping("", XHTML); + handler.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES); + handler.startElement(XHTML, "head", "head", EMPTY_ATTRIBUTES); + handler.startElement(XHTML, "title", "head", EMPTY_ATTRIBUTES); + handler.characters(TITLE, 0, TITLE.length); + handler.endElement(XHTML, "title", "head"); + + handler.endElement(XHTML, "head", "head"); + handler.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES); + char[] body = new char[charsToWrite]; + for (int i = 0; i < charsToWrite; i++) { + body[i] = 'a'; + } + handler.characters(body, 0, body.length); + handler.endElement(XHTML, "body", "body"); + handler.endElement(XHTML, "html", "html"); + handler.endDocument(); + } + } +}