Author: jukka
Date: Sun Dec 7 11:10:04 2008
New Revision: 724176
URL: http://svn.apache.org/viewvc?rev=724176&view=rev
Log:
TIKA-179: Tika stand alone CLI --text output mostly not working, other output
formats are fine
Fixed the problem of text output to a byte stream (for example with
the --text option in the CLI) being lost or clipped due to a buffer not
being flushed. Added a test case for this in BodyContentHandlerTest.
Now the WriteOutContentHandler will explicitly flush the output writer
when an endDocument() event is received. I needed to modify a number of
other classes so that the startDocument() and endDocument() events are
correctly passed down to all handlers in the stack.
On the other hand, in some handlers we explicitly want to prevent the
start/endDocument calls. Good examples are the PackageParser classes that
route the results of multiple parsers to a single output document. I added
an EmbeddedContentHandler decorator for explicitly preventing the document
events in such cases.
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java
lucene/tika/trunk/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java
lucene/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
lucene/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=724176&r1=724175&r2=724176&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Sun Dec 7 11:10:04 2008
@@ -103,11 +103,9 @@
new MatchingContentHandler(getMetaHandler(metadata), meta));
// Parse the HTML document
- xhtml.startDocument();
SAXParser parser = new SAXParser();
parser.setContentHandler(new XHTMLDowngradeHandler(handler));
parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
- xhtml.endDocument();
}
private ContentHandler getTitleHandler(final Metadata metadata) {
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=724176&r1=724175&r2=724176&view=diff
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
(original)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
Sun Dec 7 11:10:04 2008
@@ -25,6 +25,7 @@
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -92,7 +93,7 @@
try {
getParser().parse(
new CloseShieldInputStream(stream),
- new BodyContentHandler(xhtml),
+ new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
metadata);
xhtml.characters("\n");
} catch (TikaException e) {
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java?rev=724176&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java
Sun Dec 7 11:10:04 2008
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import org.xml.sax.ContentHandler;
+
+/**
+ * Content handler decorator that prevents the {@link #startDocument()}
+ * and {@link #endDocument()} events from reaching the decorated handler.
+ * This is useful when you want to direct the results of parsing multiple
+ * different XML documents into a single target document without worrying
+ * about the {@link #startDocument()} and {@link #endDocument()} methods
+ * being called more than once.
+ */
+public class EmbeddedContentHandler extends ContentHandlerDecorator {
+
+ /**
+ * Created a decorator that prevents the given handler from
+ * receiving {@link #startDocument()} and {@link #endDocument()}
+ * events.
+ *
+ * @param handler the content handler to be decorated
+ */
+ public EmbeddedContentHandler(ContentHandler handler) {
+ super(handler);
+ }
+
+ /**
+ * Ignored.
+ */
+ @Override
+ public void startDocument() {
+ }
+
+ /**
+ * Ignored.
+ */
+ @Override
+ public void endDocument() {
+ }
+
+}
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java?rev=724176&r1=724175&r2=724176&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java
(original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java
Sun Dec 7 11:10:04 2008
@@ -23,7 +23,8 @@
/**
* Content handler decorator that only passes the
* {@link #characters(char[], int, int)} and
- * (@link {@link #ignorableWhitespace(char[], int, int)} events to
+ * (@link {@link #ignorableWhitespace(char[], int, int)}
+ * (plus {@link #startDocument()} and {@link #endDocument()} events to
* the decorated content handler.
*/
public class TextContentHandler extends DefaultHandler {
@@ -47,6 +48,16 @@
}
@Override
+ public void startDocument() throws SAXException {
+ delegate.startDocument();
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ delegate.endDocument();
+ }
+
+ @Override
public String toString() {
return delegate.toString();
}
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java?rev=724176&r1=724175&r2=724176&view=diff
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
(original)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
Sun Dec 7 11:10:04 2008
@@ -79,6 +79,22 @@
}
/**
+ * Flushes the character stream so that no characters are forgotten
+ * in internal buffers.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-179">TIKA-179</a>
+ * @throws SAXException if the stream can not be flushed
+ */
+ @Override
+ public void endDocument() throws SAXException {
+ try {
+ writer.flush();
+ } catch (IOException e) {
+ throw new SAXException("Error flushing character output", e);
+ }
+ }
+
+ /**
* Returns the contents of the internal string buffer where
* all the received characters have been collected. Only works
* when this object was constructed using the empty default
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java?rev=724176&r1=724175&r2=724176&view=diff
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
(original)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
Sun Dec 7 11:10:04 2008
@@ -26,9 +26,7 @@
/**
* Content handler decorator that only passes the elements, attributes,
- * and text nodes that match the given XPath expression. Note especially
- * that {@link #startDocument()} and {@link #endDocument()} events are not
- * passed to the decorated handler.
+ * and text nodes that match the given XPath expression.
*/
public class MatchingContentHandler extends ContentHandlerDecorator {
@@ -102,16 +100,4 @@
}
}
- /**
- * Ignored.
- */
- public void startDocument() {
- }
-
- /**
- * Ignored.
- */
- public void endDocument() {
- }
-
}
Added:
lucene/tika/trunk/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java?rev=724176&view=auto
==============================================================================
---
lucene/tika/trunk/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
(added)
+++
lucene/tika/trunk/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
Sun Dec 7 11:10:04 2008
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.io.ByteArrayOutputStream;
+import java.io.OutputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Test cases for the {@link BodyContentHandler} class.
+ */
+public class BodyContentHandlerTest extends TestCase {
+
+ /**
+ * Test that the conversion to an {@link OutputStream} doesn't leave
+ * characters unflushed in an internal buffer.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-179">TIKA-179</a>
+ */
+ public void testOutputStream() throws Exception {
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(
+ new BodyContentHandler(buffer), new Metadata());
+ xhtml.startDocument();
+ xhtml.element("p", "Test text");
+ xhtml.endDocument();
+
+ assertEquals("Test text\n", buffer.toString());
+ }
+
+}