Author: jukka
Date: Thu Nov 27 16:30:48 2008
New Revision: 721317
URL: http://svn.apache.org/viewvc?rev=721317&view=rev
Log:
TIKA-171: New ContentHandler for plain text output that has no problem with
missing white space after XHTML block tags
Patch by Uwe Schindler.
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLToTextContentHandler.java
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
lucene/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java?rev=721317&r1=721316&r2=721317&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
(original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
Thu Nov 27 16:30:48 2008
@@ -60,7 +60,7 @@
* @param writer writer
*/
public BodyContentHandler(Writer writer) {
- this(new WriteOutContentHandler(writer));
+ this(new XHTMLToTextContentHandler(new WriteOutContentHandler(writer)));
}
/**
@@ -70,7 +70,7 @@
* @param stream output stream
*/
public BodyContentHandler(OutputStream stream) {
- this(new WriteOutContentHandler(stream));
+ this(new XHTMLToTextContentHandler(new WriteOutContentHandler(stream)));
}
/**
@@ -79,7 +79,7 @@
* using the {@link #toString()} method.
*/
public BodyContentHandler() {
- this(new WriteOutContentHandler());
+ this(new XHTMLToTextContentHandler(new WriteOutContentHandler()));
}
}
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java?rev=721317&r1=721316&r2=721317&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java
(original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java
Thu Nov 27 16:30:48 2008
@@ -34,14 +34,21 @@
this.delegate = delegate;
}
+ @Override
public void characters(char[] ch, int start, int length)
throws SAXException {
delegate.characters(ch, start, length);
}
+ @Override
public void ignorableWhitespace(char[] ch, int start, int length)
throws SAXException {
delegate.ignorableWhitespace(ch, start, length);
}
+ @Override
+ public String toString() {
+ return delegate.toString();
+ }
+
}
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLToTextContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLToTextContentHandler.java?rev=721317&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLToTextContentHandler.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLToTextContentHandler.java
Thu Nov 27 16:30:48 2008
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.util.Arrays;
+import java.util.Set;
+import java.util.HashSet;
+
+import org.xml.sax.SAXException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Attributes;
+
+/**
+ * Content handler decorator that only passes the
+ * {@link #characters(char[], int, int)} and
+ * {@link #ignorableWhitespace(char[], int, int)} events to
+ * the decorated content handler.
+ * It additionally inserts a \n character at the end of each XHTML block element
+ * (</p>, </div>,...).
+ * This content handler should be used as delegate for {@link BodyContentHandler}.
+ */
+public class XHTMLToTextContentHandler extends TextContentHandler {
+
+ public XHTMLToTextContentHandler(ContentHandler handler) {
+ super(handler);
+ }
+
+ @Override
+ public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
+ if (
+ !"tr".equals(lastLocalName) &&
+ ("td".equals(localName) || "th".equals(localName))
+ ) characters(TAB,0,TAB.length);
+ lastLocalName=localName;
+ }
+
+ @Override
+ public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
+ if (HTML_BLOCK_TAGS.contains(localName)) characters(NL,0,NL.length);
+ }
+
+ private String lastLocalName=null;
+
+ private static final char[] NL=new char[]{'\n'};
+ private static final char[] TAB=new char[]{'\t'};
+
+ // special XHTML tags that start new lines
+ private static final Set<String> HTML_BLOCK_TAGS=new HashSet<String>(Arrays.asList(
+ "p","div","fieldset","table","form",
+ "pre","blockquote","address",
+ "ul","ol","dl","li","dt","dd",
+ "h1","h2","h3","h4","h5","h6",
+ "noscript","noframes",
+ "hr","br","tr"
+ ));
+
+}
Modified:
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java?rev=721317&r1=721316&r2=721317&view=diff
==============================================================================
---
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
(original)
+++
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
Thu Nov 27 16:30:48 2008
@@ -40,6 +40,7 @@
assertEquals('e', reader.read());
assertEquals('n', reader.read());
assertEquals('t', reader.read());
+ assertEquals('\n', reader.read());
assertEquals(-1, reader.read());
reader.close();
assertEquals(-1, stream.read());
@@ -61,6 +62,7 @@
assertEquals('e', reader.read());
assertEquals('n', reader.read());
assertEquals('t', reader.read());
+ assertEquals('\n', reader.read());
assertEquals(-1, reader.read());
reader.close();
assertEquals(-1, stream.read());
Modified:
lucene/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=721317&r1=721316&r2=721317&view=diff
==============================================================================
---
lucene/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
(original)
+++
lucene/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
Thu Nov 27 16:30:48 2008
@@ -78,7 +78,7 @@
parser.parse(
new ByteArrayInputStream(new byte[0]), handler, metadata);
assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("", handler.toString());
+ assertEquals("\n", handler.toString());
}
}