Author: mikemccand
Date: Mon Oct  3 18:19:43 2011
New Revision: 1178491

URL: http://svn.apache.org/viewvc?rev=1178491&view=rev
Log:
TIKA-733: try to be robust when RTF doc has too many closing }'s vs opening {'s

Modified:
    tika/trunk/CHANGES.txt
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1178491&r1=1178490&r2=1178491&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Oct  3 18:19:43 2011
@@ -5,6 +5,9 @@ Release 0.11 - Current Development
  * TIKA-632: Hyperlinks in RTF documents are now extracted as an <a
    href=...>...</a> element.
 
+ * TIKA-733: Try to be robust when an RTF has too many closing }'s vs
+   opening {'s.
+
 Release 0.10 - 09/25/2011
 
 The most notable changes in Tika 0.10 over previous releases are:

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1178491&r1=1178490&r2=1178491&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
 Mon Oct  3 18:19:43 2011
@@ -1023,34 +1023,39 @@ final class TextExtractor {
         assert groupState.depth > 0;
         ansiSkip = 0;
 
-        // Restore group state:
-        final GroupState outerGroupState = groupStates.removeLast();
-
-        // Close italic, if outer does not have italic or
-        // bold changed:
-        if (groupState.italic) {
-            if (!outerGroupState.italic ||
-                groupState.bold != outerGroupState.bold) {
-                end("i");
-                groupState.italic = false;
+        // Be robust if RTF doc is corrupt (has too many
+        // closing }s):
+        // TODO: log a warning?
+        if (groupStates.size() > 0) {
+            // Restore group state:
+            final GroupState outerGroupState = groupStates.removeLast();
+
+            // Close italic, if outer does not have italic or
+            // bold changed:
+            if (groupState.italic) {
+                if (!outerGroupState.italic ||
+                    groupState.bold != outerGroupState.bold) {
+                    end("i");
+                    groupState.italic = false;
+                }
             }
-        }
 
-        // Close bold
-        if (groupState.bold && !outerGroupState.bold) {
-            end("b");
-        }
+            // Close bold
+            if (groupState.bold && !outerGroupState.bold) {
+                end("b");
+            }
 
-        // Open bold
-        if (!groupState.bold && outerGroupState.bold) {
-            start("b");
-        }
+            // Open bold
+            if (!groupState.bold && outerGroupState.bold) {
+                start("b");
+            }
 
-        // Open italic
-        if (!groupState.italic && outerGroupState.italic) {
-            start("i");
+            // Open italic
+            if (!groupState.italic && outerGroupState.italic) {
+                start("i");
+            }
+            groupState = outerGroupState;
         }
-        groupState = outerGroupState;
         assert groupStates.size() == groupState.depth;
 
         if (fieldState == 1) {


Reply via email to