Author: tallison
Date: Wed Feb 11 12:59:03 2015
New Revision: 1658947

URL: http://svn.apache.org/r1658947
Log:
TIKA-1544 consecutive new lines not preserved in rtf

Added:
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFNewlines.rtf
Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1658947&r1=1658946&r2=1658947&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
 Wed Feb 11 12:59:03 2015
@@ -622,6 +622,10 @@ final class TextExtractor {
 
     private void endParagraph(boolean preserveStyles) throws IOException, 
SAXException, TikaException {
         pushText();
+        //maintain consecutive new lines
+        if (!inParagraph) {
+            lazyStartParagraph();
+        }
         if (inParagraph) {
             if (groupState.italic) {
                 end("i");

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1658947&r1=1658946&r2=1658947&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
 Wed Feb 11 12:59:03 2015
@@ -16,6 +16,22 @@
  */
 package org.apache.tika.parser.rtf;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
 import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
 import org.apache.tika.extractor.ContainerExtractor;
@@ -38,22 +54,6 @@ import org.apache.tika.sax.WriteOutConte
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
 /**
  * Junit test class for the Tika {@link RTFParser}
  */
@@ -547,7 +547,24 @@ public class RTFParserTest extends TikaT
         assertEquals(40, meta_jpg.names().length);
         assertEquals(105, meta_jpg.names().length);
     }
-    
+
+    @Test // TIKA-1544 regression test: consecutive new lines in RTF must be preserved as empty paragraphs
+    public void testMultipleNewlines() throws Exception {
+        String content = getXML("testRTFNewlines.rtf").xml; // XML output for the fixture added in this commit
+        content = content.replaceAll("[\r\n]+", " "); // collapse each run of CR/LF into one space so the expected string is line-ending independent
+        assertContains("<body><p>one</p> " +
+                "<p /> " + // fixture has one bare \par between "one" and "two"
+                "<p>two</p> " +
+                "<p /> " + // two bare \par controls between "two" and "three"
+                "<p /> " +
+                "<p>three</p> " +
+                "<p /> " + // three bare \par controls between "three" and "four"
+                "<p /> " +
+                "<p /> " +
+                "<p>four</p>", content);
+    }
+
+
     //TIKA-1010 test linked embedded doc
     @Test
     public void testEmbeddedLinkedDocument() throws Exception {

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFNewlines.rtf
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFNewlines.rtf?rev=1658947&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFNewlines.rtf 
(added)
+++ 
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFNewlines.rtf 
Wed Feb 11 12:59:03 2015
@@ -0,0 +1,27 @@
+{\rtf1\ansi\ansicpg1252\uc1\deff0\stshfdbch0\stshfloch0\stshfhich0\stshfbi0\deflang1036\deflangfe1036{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose
 02020603050405020304}Times New Roman;}{\f37\froman\fcharset238\fprq2 Times New 
Roman CE;}
+{\f38\froman\fcharset204\fprq2 Times New Roman 
Cyr;}{\f40\froman\fcharset161\fprq2 Times New Roman 
Greek;}{\f41\froman\fcharset162\fprq2 Times New Roman 
Tur;}{\f42\froman\fcharset177\fprq2 Times New Roman (Hebrew);}
+{\f43\froman\fcharset178\fprq2 Times New Roman 
(Arabic);}{\f44\froman\fcharset186\fprq2 Times New Roman 
Baltic;}{\f45\froman\fcharset163\fprq2 Times New Roman 
(Vietnamese);}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;
+\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;
+\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\ql 
\li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 
\fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 \snext0 
Normal;}{\*\cs10 \additive \ssemihidden 
+Default Paragraph 
Font;}{\*\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv
 
+\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 
\fs20\lang1024\langfe1024\cgrid\langnp1024\langfenp1024 \snext11 \ssemihidden 
Normal Table;}}{\*\latentstyles\lsdstimax156\lsdlockeddef0}{\*\rsidtbl 
\rsid2954171\rsid10375891}
+{\*\generator Microsoft Word 11.0.6568;}{\info{\title Test d\'92indexation 
Word}{\author Bibliotheque}{\operator 
Bibliotheque}{\creatim\yr2006\mo5\dy18\hr12\min19}{\revtim\yr2006\mo5\dy18\hr12\min19}{\version2}{\edmins0}{\nofpages1}{\nofwords3}
+{\nofchars21}{\*\company Universite 
Laval}{\nofcharsws23}{\vern24579}}\paperw11906\paperh16838\margl1417\margr1417\margt1417\margb1417
 
+\deftab708\widowctrl\ftnbj\aenddoc\hyphhotz425\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace180\dgvspace180\dghorigin1417\dgvorigin1417\dghshow1\dgvshow1
+\jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule\nobrkwrptbl\snaptogridincell\allowfieldendsel\wrppunct\asianbrkrule\nojkernpunct\rsidroot2954171
 \fet0
+\sectd 
\linex0\headery708\footery708\colsx708\endnhere\sectlinegrid360\sectdefaultcl\sftnbj
 {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta 
.}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl3
+\pndec\pnstart1\pnindent720\pnhang {\pntxta 
.}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta 
)}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta 
)}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}
+{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta 
)}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta 
)}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta 
)}}\pard\plain 
+\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 
\fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid2954171
+one
+\par
+\par two
+\par 
+\par 
+\par three
+\par 
+\par 
+\par 
+\par four
+\par 
+}}
\ No newline at end of file


Reply via email to