Author: mikemccand
Date: Mon Oct 3 18:19:43 2011
New Revision: 1178491
URL: http://svn.apache.org/viewvc?rev=1178491&view=rev
Log:
TIKA-733: try to be robust when RTF doc has too many closing }'s vs opening {'s
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1178491&r1=1178490&r2=1178491&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Oct 3 18:19:43 2011
@@ -5,6 +5,9 @@ Release 0.11 - Current Development
* TIKA-632: Hyperlinks in RTF documents are now extracted as an <a
href=...>...</a> element.
+ * TIKA-733: Try to be robust when an RTF has too many closing }'s vs
+ opening {'s.
+
Release 0.10 - 09/25/2011
The most notable changes in Tika 0.10 over previous releases are:
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1178491&r1=1178490&r2=1178491&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
Mon Oct 3 18:19:43 2011
@@ -1023,34 +1023,39 @@ final class TextExtractor {
assert groupState.depth > 0;
ansiSkip = 0;
- // Restore group state:
- final GroupState outerGroupState = groupStates.removeLast();
-
- // Close italic, if outer does not have italic or
- // bold changed:
- if (groupState.italic) {
- if (!outerGroupState.italic ||
- groupState.bold != outerGroupState.bold) {
- end("i");
- groupState.italic = false;
+ // Be robust if RTF doc is corrupt (has too many
+ // closing }s):
+ // TODO: log a warning?
+ if (groupStates.size() > 0) {
+ // Restore group state:
+ final GroupState outerGroupState = groupStates.removeLast();
+
+ // Close italic, if outer does not have italic or
+ // bold changed:
+ if (groupState.italic) {
+ if (!outerGroupState.italic ||
+ groupState.bold != outerGroupState.bold) {
+ end("i");
+ groupState.italic = false;
+ }
}
- }
- // Close bold
- if (groupState.bold && !outerGroupState.bold) {
- end("b");
- }
+ // Close bold
+ if (groupState.bold && !outerGroupState.bold) {
+ end("b");
+ }
- // Open bold
- if (!groupState.bold && outerGroupState.bold) {
- start("b");
- }
+ // Open bold
+ if (!groupState.bold && outerGroupState.bold) {
+ start("b");
+ }
- // Open italic
- if (!groupState.italic && outerGroupState.italic) {
- start("i");
+ // Open italic
+ if (!groupState.italic && outerGroupState.italic) {
+ start("i");
+ }
+ groupState = outerGroupState;
}
- groupState = outerGroupState;
assert groupStates.size() == groupState.depth;
if (fieldState == 1) {