Author: jukka
Date: Fri Mar 19 16:53:26 2010
New Revision: 925321
URL: http://svn.apache.org/viewvc?rev=925321&view=rev
Log:
TIKA-261: Ability to limit the amount of extracted text
Add a write limit feature to WriteOutContentHandler and use it in the
Tika.parseToString() methods.
The write limit is set by default to 100k characters when buffering to memory,
and disabled when streaming.
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=925321&r1=925320&r2=925321&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
(original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Fri Mar
19 16:53:26 2010
@@ -33,7 +33,7 @@ import org.apache.tika.parser.ParseConte
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParsingReader;
import org.apache.tika.sax.BodyContentHandler;
-import org.xml.sax.ContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.SAXException;
/**
@@ -58,6 +58,13 @@ public class Tika {
private final Parser parser;
/**
+ * Maximum length of the strings returned by the parseToString methods.
+ * Used to prevent out of memory problems with huge input documents.
+ * The default setting is 100k characters.
+ */
+ private int maxStringLength = 100 * 1000;
+
+ /**
* Creates a Tika facade using the given configuration.
*
* @param config Tika configuration
@@ -238,6 +245,11 @@ public class Tika {
/**
* Parses the given document and returns the extracted text content.
* The given input stream is closed by this method.
+ * <p>
+ * To avoid unpredictable excess memory use, the returned string contains
+ * only up to {@link #getMaxStringLength()} first characters extracted
+ * from the input document. Use the {@link #setMaxStringLength(int)}
+ * method to adjust this limitation.
*
* @param stream the document to be parsed
* @param metadata document metadata
@@ -247,23 +259,32 @@ public class Tika {
*/
public String parseToString(InputStream stream, Metadata metadata)
throws IOException, TikaException {
+ WriteOutContentHandler handler =
+ new WriteOutContentHandler(maxStringLength);
try {
- ContentHandler handler = new BodyContentHandler();
ParseContext context = new ParseContext();
context.set(Parser.class, parser);
- parser.parse(stream, handler, metadata, context);
- return handler.toString();
+ parser.parse(
+ stream, new BodyContentHandler(handler), metadata,
context);
} catch (SAXException e) {
- // This should never happen with BodyContentHandler...
- throw new TikaException("Unexpected SAX processing failure", e);
+ if (!handler.isWriteLimitReached(e)) {
+ // This should never happen with BodyContentHandler...
+ throw new TikaException("Unexpected SAX processing failure",
e);
+ }
} finally {
stream.close();
}
+ return handler.toString();
}
/**
* Parses the given document and returns the extracted text content.
* The given input stream is closed by this method.
+ * <p>
+ * To avoid unpredictable excess memory use, the returned string contains
+ * only up to {@link #getMaxStringLength()} first characters extracted
+ * from the input document. Use the {@link #setMaxStringLength(int)}
+ * method to adjust this limitation.
*
* @param stream the document to be parsed
* @return extracted text content
@@ -277,6 +298,11 @@ public class Tika {
/**
* Parses the given file and returns the extracted text content.
+ * <p>
+ * To avoid unpredictable excess memory use, the returned string contains
+ * only up to {@link #getMaxStringLength()} first characters extracted
+ * from the input document. Use the {@link #setMaxStringLength(int)}
+ * method to adjust this limitation.
*
* @param file the file to be parsed
* @return extracted text content
@@ -290,6 +316,11 @@ public class Tika {
/**
* Parses the resource at the given URL and returns the extracted
* text content.
+ * <p>
+ * To avoid unpredictable excess memory use, the returned string contains
+ * only up to {@link #getMaxStringLength()} first characters extracted
+ * from the input document. Use the {@link #setMaxStringLength(int)}
+ * method to adjust this limitation.
*
* @param url the URL of the resource to be parsed
* @return extracted text content
@@ -302,4 +333,27 @@ public class Tika {
return parseToString(stream, metadata);
}
+ /**
+ * Returns the maximum length of strings returned by the
+ * parseToString methods.
+ *
+ * @since Apache Tika 0.7
+ * @return maximum string length, or -1 if the limit has been disabled
+ */
+ public int getMaxStringLength() {
+ return maxStringLength;
+ }
+
+ /**
+ * Sets the maximum length of strings returned by the parseToString
+ * methods.
+ *
+ * @since Apache Tika 0.7
+ * @param maxStringLength maximum string length,
+ * or -1 to disable this limit
+ */
+ public void setMaxStringLength(int maxStringLength) {
+ this.maxStringLength = maxStringLength;
+ }
+
}
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java?rev=925321&r1=925320&r2=925321&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java
(original)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java
Fri Mar 19 16:53:26 2010
@@ -23,6 +23,7 @@ import org.apache.tika.sax.xpath.Matcher
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Content handler decorator that only passes everything inside
@@ -77,6 +78,25 @@ public class BodyContentHandler extends
* Creates a content handler that writes XHTML body character events to
* an internal string buffer. The contents of the buffer can be retrieved
* using the {@link #toString()} method.
+ * <p>
+ * The internal string buffer is bounded at the given number of characters.
+ * If this write limit is reached, then a {@link SAXException} is thrown.
+ *
+ * @since Apache Tika 0.7
+ * @param writeLimit maximum number of characters to include in the string,
+ * or -1 to disable the write limit
+ */
+ public BodyContentHandler(int writeLimit) {
+ this(new WriteOutContentHandler(writeLimit));
+ }
+
+ /**
+ * Creates a content handler that writes XHTML body character events to
+ * an internal string buffer. The contents of the buffer can be retrieved
+ * using the {@link #toString()} method.
+ * <p>
+ * The internal string buffer is bounded at 100k characters. If this write
+ * limit is reached, then a {@link SAXException} is thrown.
*/
public BodyContentHandler() {
this(new WriteOutContentHandler());
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java?rev=925321&r1=925320&r2=925321&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
(original)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
Fri Mar 19 16:53:26 2010
@@ -37,13 +37,29 @@ public class WriteOutContentHandler exte
private final Writer writer;
/**
+ * The maximum number of characters to write to the character stream.
+ * Set to -1 for no limit.
+ */
+ private final int writeLimit;
+
+ /**
+ * Number of characters written so far.
+ */
+ private int writeCount = 0;
+
+ private WriteOutContentHandler(Writer writer, int writeLimit) {
+ this.writer = writer;
+ this.writeLimit = writeLimit;
+ }
+
+ /**
* Creates a content handler that writes character events to
* the given writer.
*
* @param writer writer
*/
public WriteOutContentHandler(Writer writer) {
- this.writer = writer;
+ this(writer, -1);
}
/**
@@ -60,9 +76,32 @@ public class WriteOutContentHandler exte
* Creates a content handler that writes character events
* to an internal string buffer. Use the {@link #toString()}
* method to access the collected character content.
+ * <p>
+ * The internal string buffer is bounded at the given number of characters.
+ * If this write limit is reached, then a {@link SAXException} is thrown.
+ * The {@link #isWriteLimitReached(Throwable)} method can be used to
+ * detect this case.
+ *
+ * @since Apache Tika 0.7
+ * @param writeLimit maximum number of characters to include in the string,
+ * or -1 to disable the write limit
+ */
+ public WriteOutContentHandler(int writeLimit) {
+ this(new StringWriter(), writeLimit);
+ }
+
+ /**
+ * Creates a content handler that writes character events
+ * to an internal string buffer. Use the {@link #toString()}
+ * method to access the collected character content.
+ * <p>
+ * The internal string buffer is bounded at 100k characters. If this
+ * write limit is reached, then a {@link SAXException} is thrown. The
+ * {@link #isWriteLimitReached(Throwable)} method can be used to detect
+ * this case.
*/
public WriteOutContentHandler() {
- this(new StringWriter());
+ this(100 * 1000);
}
/**
@@ -72,7 +111,14 @@ public class WriteOutContentHandler exte
public void characters(char[] ch, int start, int length)
throws SAXException {
try {
- writer.write(ch, start, length);
+ if (writeLimit == -1 || writeCount + length <= writeLimit) {
+ writer.write(ch, start, length);
+ writeCount += length;
+ } else {
+ writer.write(ch, start, writeLimit - writeCount);
+ writeCount = writeLimit;
+ throw new WriteLimitReachedException();
+ }
} catch (IOException e) {
throw new SAXException("Error writing out character content", e);
}
@@ -85,11 +131,7 @@ public class WriteOutContentHandler exte
@Override
public void ignorableWhitespace(char[] ch, int start, int length)
throws SAXException {
- try {
- writer.write(ch, start, length);
- } catch (IOException e) {
- throw new SAXException("Error writing out character content", e);
- }
+ characters(ch, start, length);
}
/**
@@ -120,4 +162,32 @@ public class WriteOutContentHandler exte
return writer.toString();
}
+ /**
+ * Checks whether the given exception (or any of its root causes) was
+ * thrown by this handler as a signal of reaching the write limit.
+ *
+ * @since Apache Tika 0.7
+ * @param t throwable
+ * @return <code>true</code> if the write limit was reached,
+ * <code>false</code> otherwise
+ */
+ public boolean isWriteLimitReached(Throwable t) {
+ if (t instanceof WriteLimitReachedException) {
+ return this == ((WriteLimitReachedException) t).getSource();
+ } else {
+ return t.getCause() != null && isWriteLimitReached(t.getCause());
+ }
+ }
+
+ /**
+ * The exception used as a signal when the write limit has been reached.
+ */
+ private class WriteLimitReachedException extends SAXException {
+
+ public WriteOutContentHandler getSource() {
+ return WriteOutContentHandler.this;
+ }
+
+ }
+
}
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=925321&r1=925320&r2=925321&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
Fri Mar 19 16:53:26 2010
@@ -181,7 +181,7 @@ public class AutoDetectParserTest extend
"/test-documents/TIKA-216.tgz");
try {
Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
+ ContentHandler handler = new BodyContentHandler(-1);
new AutoDetectParser().parse(tgz, handler, metadata);
fail("Zip bomb was not detected");
} catch (TikaException e) {