Author: mattmann
Date: Wed Aug 17 15:46:30 2011
New Revision: 1158779
URL: http://svn.apache.org/viewvc?rev=1158779&view=rev
Log:
- patch for TIKA-422 contributed by Mike McCandless.
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFWithCurlyBraces.rtf
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1158779&r1=1158778&r2=1158779&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Wed Aug 17 15:46:30 2011
@@ -102,6 +102,12 @@ public class RTFParserTest extends TikaT
assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", content);
}
+ public void testTextWithCurlyBraces() throws Exception {
+ String content = getText("testRTFWithCurlyBraces.rtf");
+ //assertContains("{ some text inside curly brackets }", content);
+ assertContains("{ some text inside curly brackets }", content);
+ }
+
private String getText(String filename) throws Exception {
File file = getResourceAsFile("/test-documents/" + filename);
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFWithCurlyBraces.rtf
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFWithCurlyBraces.rtf?rev=1158779&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFWithCurlyBraces.rtf
(added)
+++
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFWithCurlyBraces.rtf
Wed Aug 17 15:46:30 2011
@@ -0,0 +1,44 @@
+{\rtf1\ansi\ansicpg1251\uc1\deff1\stshfdbch0\stshfloch0\stshfhich0\stshfbi0\deflang1049\deflangfe1049{\fonttbl{\f0\froman\fcharset204\fprq2{\*\panose
02020603050405020304}Times New Roman;}
+{\f1\fswiss\fcharset204\fprq2{\*\panose
020b0604020202020204}Arial;}{\f2\fmodern\fcharset204\fprq1{\*\panose
02070309020205020404}Courier New;}{\f38\fswiss\fcharset204\fprq2{\*\panose
020b0604030504040204}Verdana;}
+{\f41\froman\fcharset0\fprq2 Times New Roman;}{\f39\froman\fcharset238\fprq2
Times New Roman CE;}{\f42\froman\fcharset161\fprq2 Times New Roman
Greek;}{\f43\froman\fcharset162\fprq2 Times New Roman Tur;}
+{\f44\froman\fcharset177\fprq2 Times New Roman
(Hebrew);}{\f45\froman\fcharset178\fprq2 Times New Roman
(Arabic);}{\f46\froman\fcharset186\fprq2 Times New Roman
Baltic;}{\f47\froman\fcharset163\fprq2 Times New Roman (Vietnamese);}
+{\f51\fswiss\fcharset0\fprq2 Arial;}{\f49\fswiss\fcharset238\fprq2 Arial
CE;}{\f52\fswiss\fcharset161\fprq2 Arial Greek;}{\f53\fswiss\fcharset162\fprq2
Arial Tur;}{\f54\fswiss\fcharset177\fprq2 Arial (Hebrew);}
+{\f55\fswiss\fcharset178\fprq2 Arial (Arabic);}{\f56\fswiss\fcharset186\fprq2
Arial Baltic;}{\f57\fswiss\fcharset163\fprq2 Arial
(Vietnamese);}{\f61\fmodern\fcharset0\fprq1 Courier
New;}{\f59\fmodern\fcharset238\fprq1 Courier New CE;}
+{\f62\fmodern\fcharset161\fprq1 Courier New
Greek;}{\f63\fmodern\fcharset162\fprq1 Courier New
Tur;}{\f64\fmodern\fcharset177\fprq1 Courier New
(Hebrew);}{\f65\fmodern\fcharset178\fprq1 Courier New (Arabic);}
+{\f66\fmodern\fcharset186\fprq1 Courier New
Baltic;}{\f67\fmodern\fcharset163\fprq1 Courier New
(Vietnamese);}{\f421\fswiss\fcharset0\fprq2
Verdana;}{\f419\fswiss\fcharset238\fprq2 Verdana
CE;}{\f422\fswiss\fcharset161\fprq2 Verdana Greek;}
+{\f423\fswiss\fcharset162\fprq2 Verdana Tur;}{\f426\fswiss\fcharset186\fprq2
Verdana Baltic;}{\f427\fswiss\fcharset163\fprq2 Verdana
(Vietnamese);}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;
+\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;
+\red192\green192\blue192;\red255\green255\blue255;}{\stylesheet{\qj
\fi720\li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0
\f1\fs20\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \snext0 Normal;}{
+\s1\qc \li0\ri0\sb108\sa108\nowidctlpar\faauto\outlinelevel0\rin0\lin0\itap0
\b\f1\fs20\cf9\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon0
\snext0 heading 1;}{\s2\qc
\li0\ri0\sb108\sa108\nowidctlpar\faauto\outlinelevel1\rin0\lin0\itap0
+\b\f1\fs20\cf9\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon1
\snext0 heading 2;}{\s3\qc
\li0\ri0\sb108\sa108\nowidctlpar\faauto\outlinelevel2\rin0\lin0\itap0
\b\f1\fs20\cf9\lang1049\langfe1049\cgrid\langnp1049\langfenp1049
+\sbasedon2 \snext0 heading 3;}{\s4\qc
\li0\ri0\sb108\sa108\nowidctlpar\faauto\outlinelevel3\rin0\lin0\itap0
\b\f1\fs20\cf9\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon3
\snext0 heading 4;}{\*\cs10 \additive \ssemihidden
+Default Paragraph
Font;}{\*\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv
+\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\fs20\lang1024\langfe1024\cgrid\langnp1024\langfenp1024 \snext11 \ssemihidden
Normal Table;}{\*\cs15 \additive \b\fs20\cf9
+\'d6\'e2\'e5\'f2\'ee\'e2\'ee\'e5
\'e2\'fb\'e4\'e5\'eb\'e5\'ed\'e8\'e5;}{\*\cs16 \additive \fs20\ul\cf11
\sbasedon15 \'c3\'e8\'ef\'e5\'f0\'f2\'e5\'ea\'f1\'f2\'ee\'e2\'e0\'ff
\'f1\'f1\'fb\'eb\'ea\'e0;}{
+\s17\qj \fi-2504\li1612\ri0\nowidctlpar\faauto\rin0\lin1612\itap0
\f1\fs20\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon0 \snext0
\'c7\'e0\'e3\'ee\'eb\'ee\'e2\'ee\'ea \'f1\'f2\'e0\'f2\'fc\'e8;}{
+\s18\ql \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0
\f1\fs20\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon0 \snext0
\'d2\'e5\'ea\'f1\'f2 (\'eb\'e5\'e2. \'ef\'ee\'e4\'ef\'e8\'f1\'fc);}{\s19\ql
\li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0
+\f1\fs14\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon18 \snext0
\'ca\'ee\'eb\'ee\'ed\'f2\'e8\'f2\'f3\'eb (\'eb\'e5\'e2\'fb\'e9);}{\s20\qr
\li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0
+\f1\fs20\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon0 \snext0
\'d2\'e5\'ea\'f1\'f2 (\'ef\'f0\'e0\'e2. \'ef\'ee\'e4\'ef\'e8\'f1\'fc);}{\s21\qr
\li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0
+\f1\fs14\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon20 \snext0
\'ca\'ee\'eb\'ee\'ed\'f2\'e8\'f2\'f3\'eb (\'ef\'f0\'e0\'e2\'fb\'e9);}{\s22\qj
\fi-170\li170\ri0\nowidctlpar\faauto\rin0\lin170\itap0
+\i\f1\fs20\cf12\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon0
\snext0 \'ca\'ee\'ec\'ec\'e5\'ed\'f2\'e0\'f0\'e8\'e9;}{\s23\ql
\fi-170\li170\ri0\nowidctlpar\faauto\rin0\lin170\itap0
+\i\f1\fs20\cf9\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon22
\snext0 \'ca\'ee\'ec\'ec\'e5\'ed\'f2\'e0\'f0\'e8\'e9
\'ef\'ee\'eb\'fc\'e7\'ee\'e2\'e0\'f2\'e5\'eb\'ff;}{\*\cs24 \additive
\b\fs20\cf9 \sbasedon15
+\'cd\'e0\'e9\'e4\'e5\'ed\'ed\'fb\'e5 \'f1\'eb\'ee\'e2\'e0;}{\*\cs25 \additive
\fs20\cf10 \sbasedon15 \'cd\'e5 \'e2\'f1\'f2\'f3\'ef\'e8\'eb \'e2
\'f1\'e8\'eb\'f3;}{\s26\qj \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0
+\f2\fs20\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon0 \snext0
\'d2\'e0\'e1\'eb\'e8\'f6\'fb
(\'ec\'ee\'ed\'ee\'f8\'e8\'f0\'e8\'ed\'ed\'fb\'e9);}{\s27\qj
\fi-140\li140\ri0\nowidctlpar\faauto\rin0\lin140\itap0
+\f2\fs20\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon26 \snext0
\'ce\'e3\'eb\'e0\'e2\'eb\'e5\'ed\'e8\'e5;}{\s28\qj
\fi720\li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0
\f38\fs18\lang1049\langfe1049\cgrid\langnp1049\langfenp1049
+\sbasedon0 \snext0 \'ce\'f1\'ed\'ee\'e2\'ed\'ee\'e5 \'ec\'e5\'ed\'fe;}{\s29\qj
\fi720\li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0
\f38\fs18\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon28 \snext0
+\'cf\'e5\'f0\'e5\'ec\'e5\'ed\'ed\'e0\'ff \'f7\'e0\'f1\'f2\'fc;}{\s30\qj
\fi720\li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0
\b\f38\fs18\ul\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon28
\snext0
+\'cf\'ee\'f1\'f2\'ee\'ff\'ed\'ed\'e0\'ff \'f7\'e0\'f1\'f2\'fc;}{\s31\ql
\li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0
\f1\fs20\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon0 \snext0
\'cf\'f0\'e8\'e6\'e0\'f2\'fb\'e9 \'e2\'eb\'e5\'e2\'ee;}{\*
+\cs32 \additive \fs20\ul\cf11 \sbasedon16
\'cf\'f0\'ee\'e4\'ee\'eb\'e6\'e5\'ed\'e8\'e5 \'f1\'f1\'fb\'eb\'ea\'e8;}{\s33\qj
\li0\ri118\nowidctlpar\faauto\rin118\lin0\itap0
\f1\fs20\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon0 \snext0
+\'d1\'eb\'ee\'e2\'e0\'f0\'ed\'e0\'ff \'f1\'f2\'e0\'f2\'fc\'ff;}{\s34\ql
\fi-170\li170\ri170\nowidctlpar\faauto\rin170\lin170\itap0
\f1\fs20\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 \sbasedon0 \snext0
+\'d2\'e5\'ea\'f1\'f2 (\'f1\'ef\'f0\'e0\'e2\'ea\'e0);}{\*\cs35 \additive
\strike\fs20\cf14 \sbasedon15 \'d3\'f2\'f0\'e0\'f2\'e8\'eb
\'f1\'e8\'eb\'f3;}}{\*\revtbl {Unknown;}}{\*\rsidtbl \rsid6824087}{\*\generator
Microsoft Word 10.0.2627;}{\info
+{\title \'cc\'e5\'f2\'ee\'e4\'e8\'ea\'e0}{\author kashina}{\operator
kashina}{\creatim\yr2006\mo6\dy13\hr13\min51}{\revtim\yr2006\mo6\dy13\hr13\min51}{\version2}{\edmins1}{\nofpages10}{\nofwords4564}{\nofchars26020}{\*\company
\'d1\'e8\'e1\'cd\'c0\'d6}
+{\nofcharsws30523}{\vern16437}}\paperw11906\paperh16838\margl1134\margr850
\widowctrl\ftnbj\aenddoc\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\hyphcaps0\horzdoc\dghspace120\dgvspace120\dghorigin1701\dgvorigin1984\dghshow0\dgvshow3
+\jcompress\viewkind1\viewscale100\nolnhtadjtbl\rsidroot6824087 \fet0\sectd
\linex0\sectdefaultcl\sftnbj {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang
{\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta
.}}{\*\pnseclvl3
+\pndec\pnstart1\pnindent720\pnhang {\pntxta
.}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta
)}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta
)}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}
+{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta
)}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta
)}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta
)}}\pard\plain
+\s1\qc \li0\ri0\sb108\sa108\nowidctlpar\faauto\outlinelevel0\rin0\lin0\itap0
\b\f1\fs20\cf9\lang1049\langfe1049\cgrid\langnp1049\langfenp1049 {
+\insrsid6824087
+\par }
+{\lang1024\langfe1024\noproof\insrsid6824087 \{ some text inside curly
brackets \} }{
+\insrsid6824087
+\par }}