Author: mikemccand
Date: Tue Aug 30 19:07:45 2011
New Revision: 1163336
URL: http://svn.apache.org/viewvc?rev=1163336&view=rev
Log:
TIKA-392: add 3 RTF test cases
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFHexEscapeInsideWord.rtf
(with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFTableCellSeparation2.rtf
(with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFWindowsCodepage1250.rtf
(with props)
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1163336&r1=1163335&r2=1163336&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Tue Aug 30 19:07:45 2011
@@ -82,13 +82,31 @@ public class RTFParserTest extends TikaT
assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9
n\u00E1bo\u017Eensk\u00E9 texty", content);
}
+ public void testHexEscapeInsideWord() throws Exception {
+ String content = getText("testRTFHexEscapeInsideWord.rtf");
+ assertContains("ESPÍRITO", content);
+ }
+
+ public void testWindowsCodepage1250() throws Exception {
+ String content = getText("testRTFWindowsCodepage1250.rtf");
+ assertContains("zażółć gęślą
jaźń", content);
+ assertContains("ZAŻÓŁĆ GĘŚLĄ JAŹŃ", content);
+ }
+
public void testRTFTableCellSeparation() throws Exception {
String content = getText("testRTFTableCellSeparation.rtf");
-
+ // TODO: why do we insert extra whitespace...?
content = content.replaceAll("\\s+"," ");
assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
}
+ public void testRTFTableCellSeparation2() throws Exception {
+ String content = getText("testRTFTableCellSeparation2.rtf");
+ // TODO: why do we insert extra whitespace...?
+ content = content.replaceAll("\\s+"," ");
+ assertContains("Station Fax", content);
+ }
+
public void testGothic() throws Exception {
String content = getText("testRTFUnicodeGothic.rtf");
assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A",
content);
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFHexEscapeInsideWord.rtf
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFHexEscapeInsideWord.rtf?rev=1163336&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFHexEscapeInsideWord.rtf
(added)
+++
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFHexEscapeInsideWord.rtf
Tue Aug 30 19:07:45 2011
@@ -0,0 +1,4 @@
+{\rtf1\ansi\ansicpg1252\deff0\deflang1033{\fonttbl{\f0\fswiss\fcharset0
Arial;}}
+{\*\generator Msftedit 5.41.21.2500;}\viewkind4\uc1\pard\f0\fs20 GOVERNO DO
ESTADO DO ESP\'cdRITO SANTO\par
+}
+
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFHexEscapeInsideWord.rtf
------------------------------------------------------------------------------
svn:eol-style = native
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFTableCellSeparation2.rtf
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFTableCellSeparation2.rtf?rev=1163336&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFTableCellSeparation2.rtf
(added)
+++
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFTableCellSeparation2.rtf
Tue Aug 30 19:07:45 2011
@@ -0,0 +1,3 @@
+{\rtf1\ansi\ansicpg1252\deff0\deflang1033{\fonttbl{\f0\fswiss\fcharset0
Arial;}}
+{\rtlch\fcs1 \af0\afs24 \ltrch\fcs0
\f0\fs24\lang2055\langfe2055\langfenp2055\insrsid9461491\charrsid9461491 Fax /
Phone Station\cell Fax / Phone #\cell }
+}
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFTableCellSeparation2.rtf
------------------------------------------------------------------------------
svn:eol-style = native
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFWindowsCodepage1250.rtf
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFWindowsCodepage1250.rtf?rev=1163336&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFWindowsCodepage1250.rtf
(added)
+++
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFWindowsCodepage1250.rtf
Tue Aug 30 19:07:45 2011
@@ -0,0 +1,5 @@
+{\rtf1\ansi\ansicpg1250\deff0\deflang1045{\fonttbl{\f0\fswiss\fcharset238{\*\fname
Arial;}Arial CE;}}
+{\*\generator Msftedit 5.41.15.1515;}\viewkind4\uc1\pard\f0\fs20
za\'bf\'f3\'b3\'e6 g\'ea\'9cl\'b9 ja\'9f\'f1\par
+ZA\'af\'d3\'a3\'c6 G\'ca\'8cL\'a5 JA\'8f\'d1\par
+\par
+}
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFWindowsCodepage1250.rtf
------------------------------------------------------------------------------
svn:eol-style = native