Author: lehmi
Date: Wed Apr 28 17:25:02 2010
New Revision: 939013

URL: http://svn.apache.org/viewvc?rev=939013&view=rev
Log:
PDFBOX-276: avoid IOExceptions if the pdf isn't well-formed. Patch by Peter 
Lenahan (Peter_Lenahan at ibi dot com)

Modified:
    
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java

Modified: 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=939013&r1=939012&r2=939013&view=diff
==============================================================================
--- 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java 
(original)
+++ 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java 
Wed Apr 28 17:25:02 2010
@@ -469,6 +469,65 @@ public abstract class BaseParser
     }
 
     /**
+     * This is really a bug in the Document creators code, but it caused a 
crash
+     * in PDFBox, the first bug was in this format:
+     * /Title ( (5)
+     * /Creator which was patched in 1 place.
+     * However it missed the case where the Close Paren was escaped
+     * 
+     * The second bug was in this format 
+     * /Title (c:\)
+     * /Producer 
+     * 
+     * This patch  moves this code out of the parseCOSString method, so it can 
be used twice.
+     * 
+     * 
+     * @param bracesParameter the number of braces currently open.
+     * 
+     * @return the corrected value of the brace counter
+     * @throws IOException
+     */
+       private int checkForMissingCloseParen(final int bracesParameter) throws 
IOException {
+               int braces=bracesParameter;
+           byte[] nextThreeBytes = new byte[3];
+           int amountRead = pdfSource.read(nextThreeBytes);
+       
+           //lets handle the special case seen in Bull  River Rules and 
Regulations.pdf
+           //The dictionary looks like this
+           //    2 0 obj
+           //    <<
+           //        /Type /Info
+           //        /Creator (PaperPort http://www.scansoft.com)
+           //        /Producer (sspdflib 1.0 http://www.scansoft.com)
+           //        /Title ( (5)
+           //        /Author ()
+           //        /Subject ()
+           //
+           // Notice the /Title, the braces are not even but they should
+           // be.  So lets assume that if we encounter an this scenario
+           //   <end_brace><new_line><opening_slash> then that
+           // means that there is an error in the pdf and assume that
+           // was the end of the document.
+           //
+           if( amountRead == 3 )
+           {
+               if(( nextThreeBytes[0] == 0x0d &&  // Look for a carriage return
+                    nextThreeBytes[1] == 0x0a &&  // Look for a new line
+                    nextThreeBytes[2] == 0x2f ) || // Look for a slash / 
+                                                  // Add a second case without 
a new line
+                   (nextThreeBytes[0] == 0x0d &&  // Look for a carriage return
+                    nextThreeBytes[1] == 0x2f ))  // Look for a slash /
+               {
+                   braces = 0;
+               }
+           }
+           if (amountRead > 0) 
+        {
+            pdfSource.unread( nextThreeBytes, 0, amountRead );
+        }
+           return braces;
+       }
+    /**
      * This will parse a PDF string.
      *
      * @return The parsed PDF string.
@@ -505,46 +564,12 @@ public abstract class BaseParser
         {
             char ch = (char)c;
             int nextc = -2; // not yet read
-            //if( log.isDebugEnabled() )
-            //{
-            //    log.debug( "Parsing COSString character '" + c + "' code=" + 
(int)c );
-            //}
 
             if(ch == closeBrace)
             {
-                braces--;
-                byte[] nextThreeBytes = new byte[3];
-                int amountRead = pdfSource.read(nextThreeBytes);
-
-                //lets handle the special case seen in Bull  River Rules and 
Regulations.pdf
-                //The dictionary looks like this
-                //    2 0 obj
-                //    <<
-                //        /Type /Info
-                //        /Creator (PaperPort http://www.scansoft.com)
-                //        /Producer (sspdflib 1.0 http://www.scansoft.com)
-                //        /Title ( (5)
-                //        /Author ()
-                //        /Subject ()
-                //
-                // Notice the /Title, the braces are not even but they should
-                // be.  So lets assume that if we encounter an this scenario
-                //   <end_brace><new_line><opening_slash> then that
-                // means that there is an error in the pdf and assume that
-                // was the end of the document.
-                if( amountRead == 3 )
-                {
-                    if( nextThreeBytes[0] == 0x0d &&
-                            nextThreeBytes[1] == 0x0a &&
-                            nextThreeBytes[2] == 0x2f )
-                    {
-                        braces = 0;
-                    }
-                }
-                if (amountRead > 0) 
-                {
-                    pdfSource.unread( nextThreeBytes, 0, amountRead );
-                }
+               
+               braces--;
+               braces=checkForMissingCloseParen(braces);
                 if( braces != 0 )
                 {
                     retval.append( ch );
@@ -576,8 +601,18 @@ public abstract class BaseParser
                 case 'f':
                     retval.append( '\f' );
                     break;
-                case '(':
                 case ')':
+                       // PDFBox 276 /Title (c:\)
+                       braces=checkForMissingCloseParen(braces);
+                    if( braces != 0 )
+                    {
+                        retval.append( ch );
+                    }
+                    else {
+                       retval.append('\\');
+                    }
+                    break;
+                case '(':
                 case '\\':
                     retval.append( next );
                     break;


Reply via email to