Author: jukka
Date: Wed Mar 24 13:08:43 2010
New Revision: 927044

URL: http://svn.apache.org/viewvc?rev=927044&view=rev
Log:
TIKA-392: RTF parser smashes words together in subsequent table cells

Add extra whitespace between subsequent text runs.

Modified:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=927044&r1=927043&r2=927044&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
 Wed Mar 24 13:08:43 2010
@@ -24,6 +24,7 @@ import java.util.Set;
 import javax.swing.text.AttributeSet;
 import javax.swing.text.BadLocationException;
 import javax.swing.text.DefaultStyledDocument;
+import javax.swing.text.Document;
 import javax.swing.text.StyleContext;
 import javax.swing.text.rtf.RTFEditorKit;
 
@@ -53,8 +54,7 @@ public class RTFParser implements Parser
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
         try {
-            DefaultStyledDocument sd =
-                new DefaultStyledDocument(new NoReclaimStyleContext());
+            Document sd = new CustomStyledDocument();
             new RTFEditorKit().read(stream, sd, 0);
 
             XHTMLContentHandler xhtml =
@@ -77,6 +77,33 @@ public class RTFParser implements Parser
     }
 
     /**
+     * Customized version of {@link DefaultStyledDocument}. Adds whitespace
+     * to places where words otherwise could have run together (see
+     * <a href="https://issues.apache.org/jira/browse/TIKA-392">TIKA-392</a>),
+     * and works around the problem of Swing expecting a GUI environment (see
+     * <a href="https://issues.apache.org/jira/browse/TIKA-282">TIKA-282</a>).
+     */
+    private static class CustomStyledDocument extends DefaultStyledDocument {
+
+        public CustomStyledDocument() {
+            super(new NoReclaimStyleContext());
+        }
+
+        @Override
+        public void insertString(
+                int offs, String str, AttributeSet a)
+        throws BadLocationException {
+            if (offs > 0 && offs == getLength()) {
+                super.insertString(offs, " ", a);
+                super.insertString(getLength(), str, a);
+            } else {
+                super.insertString(offs, str, a);
+            }
+        }
+
+    }
+
+    /**
      * A workaround to
      * <a href="https://issues.apache.org/jira/browse/TIKA-282">TIKA-282</a>:
      * RTF parser expects a GUI environment. This class simply disables the
@@ -90,4 +117,5 @@ public class RTFParser implements Parser
         }
 
     }
+
 }


Reply via email to