Author: jukka
Date: Thu Nov 27 16:30:48 2008
New Revision: 721317

URL: http://svn.apache.org/viewvc?rev=721317&view=rev
Log:
TIKA-171: New ContentHandler for plain text output that has no problem with 
missing white space after XHTML block tags

Patch by Uwe Schindler.

Added:
    
lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLToTextContentHandler.java
Modified:
    lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
    lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java
    
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
    
lucene/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java

Modified: 
lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java?rev=721317&r1=721316&r2=721317&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java 
(original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java 
Thu Nov 27 16:30:48 2008
@@ -60,7 +60,7 @@
      * @param writer writer
      */
     public BodyContentHandler(Writer writer) {
-        this(new WriteOutContentHandler(writer));
+        this(new XHTMLToTextContentHandler(new 
WriteOutContentHandler(writer)));
     }
 
     /**
@@ -70,7 +70,7 @@
      * @param stream output stream
      */
     public BodyContentHandler(OutputStream stream) {
-        this(new WriteOutContentHandler(stream));
+        this(new XHTMLToTextContentHandler(new 
WriteOutContentHandler(stream)));
     }
 
     /**
@@ -79,7 +79,7 @@
      * using the {@link #toString()} method.
      */
     public BodyContentHandler() {
-        this(new WriteOutContentHandler());
+        this(new XHTMLToTextContentHandler(new WriteOutContentHandler()));
     }
 
 }

Modified: 
lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java?rev=721317&r1=721316&r2=721317&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java 
(original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/TextContentHandler.java 
Thu Nov 27 16:30:48 2008
@@ -34,14 +34,21 @@
         this.delegate = delegate;
     }
 
+    @Override
     public void characters(char[] ch, int start, int length)
             throws SAXException {
         delegate.characters(ch, start, length);
     }
 
+    @Override
     public void ignorableWhitespace(char[] ch, int start, int length)
             throws SAXException {
         delegate.ignorableWhitespace(ch, start, length);
     }
 
+    @Override
+       public String toString() {
+        return delegate.toString();
+    }
+
 }

Added: 
lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLToTextContentHandler.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLToTextContentHandler.java?rev=721317&view=auto
==============================================================================
--- 
lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLToTextContentHandler.java
 (added)
+++ 
lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLToTextContentHandler.java
 Thu Nov 27 16:30:48 2008
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.util.Arrays;
+import java.util.Set;
+import java.util.HashSet;
+
+import org.xml.sax.SAXException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Attributes;
+
+/**
+ * Content handler decorator that only passes the
+ * {@link #characters(char[], int, int)} and
+ * {@link #ignorableWhitespace(char[], int, int)} events to
+ * the decorated content handler.
+ * It additionally inserts a \n character at the end of each XHTML block 
element
+ * (</p>, </div>,...).
+ * This content handler should be used as delegate for {@link 
BodyContentHandler}.
+ */
+public class XHTMLToTextContentHandler extends TextContentHandler {
+
+    public XHTMLToTextContentHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    @Override
+    public void startElement(String namespaceURI, String localName, String 
qName, Attributes atts) throws SAXException {
+        if (
+                !"tr".equals(lastLocalName) && 
+                ("td".equals(localName) || "th".equals(localName))
+        ) characters(TAB,0,TAB.length);
+        lastLocalName=localName;
+    }
+
+    @Override
+    public void endElement(String namespaceURI, String localName, String 
qName) throws SAXException {
+        if (HTML_BLOCK_TAGS.contains(localName)) characters(NL,0,NL.length);
+    }
+
+    private String lastLocalName=null;
+
+    private static final char[] NL=new char[]{'\n'};
+    private static final char[] TAB=new char[]{'\t'};
+
+    // special XHTML tags that start new lines
+    private static final Set<String> HTML_BLOCK_TAGS=new 
HashSet<String>(Arrays.asList(
+            "p","div","fieldset","table","form",
+            "pre","blockquote","address",
+            "ul","ol","dl","li","dt","dd",
+            "h1","h2","h3","h4","h5","h6",
+            "noscript","noframes",
+            "hr","br","tr"
+    ));
+
+}

Modified: 
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java?rev=721317&r1=721316&r2=721317&view=diff
==============================================================================
--- 
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java 
(original)
+++ 
lucene/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java 
Thu Nov 27 16:30:48 2008
@@ -40,6 +40,7 @@
         assertEquals('e', reader.read());
         assertEquals('n', reader.read());
         assertEquals('t', reader.read());
+        assertEquals('\n', reader.read());
         assertEquals(-1, reader.read());
         reader.close();
         assertEquals(-1, stream.read());
@@ -61,6 +62,7 @@
         assertEquals('e', reader.read());
         assertEquals('n', reader.read());
         assertEquals('t', reader.read());
+        assertEquals('\n', reader.read());
         assertEquals(-1, reader.read());
         reader.close();
         assertEquals(-1, stream.read());

Modified: 
lucene/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=721317&r1=721316&r2=721317&view=diff
==============================================================================
--- 
lucene/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java 
(original)
+++ 
lucene/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java 
Thu Nov 27 16:30:48 2008
@@ -78,7 +78,7 @@
         parser.parse(
                 new ByteArrayInputStream(new byte[0]), handler, metadata);
         assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("", handler.toString());
+        assertEquals("\n", handler.toString());
     }
 
 }


Reply via email to