Author: lehmi
Date: Sun Dec  8 17:56:28 2019
New Revision: 1871055

URL: http://svn.apache.org/viewvc?rev=1871055&view=rev
Log:
PDFBOX-3888: split method

Modified:
    pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Modified: pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1871055&r1=1871054&r2=1871055&view=diff
==============================================================================
--- pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/branches/issue4569/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sun Dec  8 17:56:28 2019
@@ -30,6 +30,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
+import java.util.stream.Collectors;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -129,8 +130,6 @@ public class COSParser extends BaseParse
      */
     private static final char[] OBJ_STREAM = new char[] { '/', 'O', 'b', 'j', 
'S', 't', 'm' };
 
-    private long trailerOffset;
-    
     /**
      * file length.
      */
@@ -148,7 +147,6 @@ public class COSParser extends BaseParse
      * Contains all found objects of a brute force search.
      */
     private Map<COSObjectKey, Long> bfSearchCOSObjectKeyOffsets = null;
-    private Long lastEOFMarker = null;
     private List<Long> bfSearchXRefTablesOffsets = null;
     private List<Long> bfSearchXRefStreamsOffsets = null;
     private PDEncryption encryption = null;
@@ -764,43 +762,36 @@ public class COSParser extends BaseParse
         if (objstmBaseObj instanceof COSStream)
         {
             // parse object stream
-            PDFObjectStreamParser parser;
+            PDFObjectStreamParser parser = null;
             try
             {
                 parser = new PDFObjectStreamParser((COSStream) objstmBaseObj, 
document);
-            }
-            catch (IOException ex)
-            {
-                if (isLenient)
-                {
-                    LOG.error("object stream " + objstmObjNr + " could not be 
parsed due to an exception", ex);
-                    return;
-                }
-                else
-                {
-                    throw ex;
-                }
-            }
-
-            try
-            {
                 parser.parse();
             }
-            catch(IOException exception)
+            catch (IOException ex)
             {
                 if (isLenient)
                 {
-                    LOG.debug("Stop reading object stream "+objstmObjNr+" due 
to an exception", exception);
+                    if (parser == null)
+                    {
+                        LOG.error("object stream " + objstmObjNr
+                                + " could not be parsed due to an exception", 
ex);
+                    }
+                    else
+                    {
+                        LOG.debug("Stop reading object stream " + objstmObjNr
+                                + " due to an exception", ex);
+                    }
                     // the error is handled in parseDictObjects
                     return;
                 }
                 else
                 {
-                    throw exception;
+                    throw ex;
                 }
             }
             // register all objects which are referenced to be contained in 
object stream
-            for (COSObject next : parser.getObjects())
+            parser.getObjects().stream().forEach(next -> 
             {
                 COSObjectKey stmObjKey = new COSObjectKey(next);
                 Long offset = 
xrefTrailerResolver.getXrefTable().get(stmObjKey);
@@ -809,7 +800,7 @@ public class COSParser extends BaseParse
                     COSObject stmObj = document.getObjectFromPool(stmObjKey);
                     stmObj.setObject(next.getObject());
                 }
-            }
+            });
         }
     }
     
@@ -1322,7 +1313,7 @@ public class COSParser extends BaseParse
      */
     private void bfSearchForObjects() throws IOException
     {
-        bfSearchForLastEOFMarker();
+        long lastEOFMarker = bfSearchForLastEOFMarker();
         bfSearchCOSObjectKeyOffsets = new HashMap<>();
         long originOffset = source.getPosition();
         long currentOffset = MINIMUM_SEARCH_OFFSET;
@@ -1619,63 +1610,181 @@ public class COSParser extends BaseParse
      * 
      * @throws IOException if something went wrong
      */
-    private void bfSearchForLastEOFMarker() throws IOException
+    private long bfSearchForLastEOFMarker() throws IOException
     {
-        if (lastEOFMarker == null)
+        long lastEOFMarker = -1;
+        long originOffset = source.getPosition();
+        source.seek(MINIMUM_SEARCH_OFFSET);
+        while (!source.isEOF())
         {
-            long originOffset = source.getPosition();
-            source.seek(MINIMUM_SEARCH_OFFSET);
-            while (!source.isEOF())
+            // search for EOF marker
+            if (isString(EOF_MARKER))
             {
-                // search for EOF marker
-                if (isString(EOF_MARKER))
+                long tempMarker = source.getPosition();
+                source.seek(tempMarker + 5);
+                try
+                {
+                    // check if the following data is some valid pdf content
+                    // which most likely indicates that the pdf is linearized,
+                    // updated or just cut off somewhere in the middle
+                    skipSpaces();
+                    if (!isString(XREF_TABLE))
+                    {
+                        readObjectNumber();
+                        readGenerationNumber();
+                    }
+                }
+                catch (IOException exception)
+                {
+                    // save the EOF marker as the following data is most 
likely some garbage
+                    LOG.debug("An exception occured during brute force for 
last EOF - ignoring",
+                            exception);
+                    lastEOFMarker = tempMarker;
+                }
+            }
+            source.read();
+        }
+        source.seek(originOffset);
+        // no EOF marker found
+        if (lastEOFMarker == -1)
+        {
+            lastEOFMarker = Long.MAX_VALUE;
+        }
+        return lastEOFMarker;
+    }
+
+    /**
+     * Brute force search for all object streams.
+     * 
+     * @throws IOException if something went wrong
+     */
+    private void bfSearchForObjStreams() throws IOException
+    {
+        // save origin offset
+        long originOffset = source.getPosition();
+
+        // log warning about skipped stream
+        bfSearchForObjStreamOffsets().entrySet().stream() //
+                .filter(o -> bfSearchCOSObjectKeyOffsets.get(o.getValue()) == 
null) //
+                .forEach(o -> LOG.warn(
+                        "Skipped incomplete object stream:" + o.getValue() + " 
at " + o.getKey()));
+
+        // collect all stream offset
+        List<Long> objStreamOffsets = 
bfSearchForObjStreamOffsets().entrySet().stream() //
+                .filter(o -> bfSearchCOSObjectKeyOffsets.get(o.getValue()) != 
null) //
+                .filter(o -> 
o.getKey().equals(bfSearchCOSObjectKeyOffsets.get(o.getValue()))) //
+                .map(Map.Entry::getKey) //
+                .collect(Collectors.toList());
+        // add all found compressed objects to the brute force search result
+        for (Long offset : objStreamOffsets)
+        {
+            source.seek(offset);
+            long stmObjNumber = readObjectNumber();
+            int stmGenNumber = readGenerationNumber();
+            readExpectedString(OBJ_MARKER, true);
+            int nrOfObjects = 0;
+            byte[] numbersBytes = null;
+            COSStream stream = null;
+            COSInputStream is = null;
+            try
+            {
+                COSDictionary dict = parseCOSDictionary();
+                int offsetFirstStream = dict.getInt(COSName.FIRST);
+                nrOfObjects = dict.getInt(COSName.N);
+                // skip the stream if required values are missing
+                if (offsetFirstStream != -1 && nrOfObjects != -1)
+                {
+                    stream = parseCOSStream(dict);
+                    if (securityHandler != null)
+                    {
+                        securityHandler.decryptStream(stream, stmObjNumber, 
stmGenNumber);
+                    }
+                    is = stream.createInputStream();
+                    numbersBytes = new byte[offsetFirstStream];
+                    long isResult = is.read(numbersBytes);
+
+                    if (Long.compare(isResult, numbersBytes.length) != 0)
+                    {
+                        LOG.debug("Tried reading " + numbersBytes.length + " 
bytes but only "
+                                + isResult + " bytes read");
+                    }
+                }
+            }
+            catch (IOException exception)
+            {
+                LOG.debug("Skipped corrupt stream: (" + stmObjNumber + " 0 at 
offset " + offset,
+                        exception);
+            }
+            finally
+            {
+                if (is != null)
+                {
+                    is.close();
+                }
+                if (stream != null)
+                {
+                    stream.close();
+                }
+            }
+            if (numbersBytes != null)
+            {
+                // convert byte array to string, skip leading/trailing spaces, 
replace LF\double spaces
+                String[] numbers = new String(numbersBytes, 
StandardCharsets.ISO_8859_1) //
+                        .trim() //
+                        .replaceAll("\n", " ") //
+                        .replace("  ", " ") //
+                        .split(" ");
+                if (numbers.length < nrOfObjects * 2)
+                {
+                    LOG.debug(
+                            "Skipped corrupt stream: (" + stmObjNumber + " 0 
at offset " + offset);
+                    continue;
+                }
+                Map<COSObjectKey, Long> xrefOffset = 
xrefTrailerResolver.getXrefTable();
+                for (int i = 0; i < nrOfObjects; i++)
                 {
-                    long tempMarker = source.getPosition();
-                    source.seek(tempMarker + 5);
                     try
                     {
-                        // check if the following data is some valid pdf 
content
-                        // which most likely indicates that the pdf is 
linearized,
-                        // updated or just cut off somewhere in the middle
-                        skipSpaces();
-                        if (!isString(XREF_TABLE))
+                        long objNumber = Long.parseLong(numbers[i * 2]);
+                        COSObjectKey objKey = new COSObjectKey(objNumber, 0);
+                        Long existingOffset = 
bfSearchCOSObjectKeyOffsets.get(objKey);
+                        if (existingOffset != null && existingOffset < 0)
+                        {
+                            // translate stream object key to its offset
+                            COSObjectKey objStmKey = new 
COSObjectKey(Math.abs(existingOffset), 0);
+                            existingOffset = 
bfSearchCOSObjectKeyOffsets.get(objStmKey);
+                        }
+                        if (existingOffset == null || offset > existingOffset)
                         {
-                            readObjectNumber();
-                            readGenerationNumber();
+                            bfSearchCOSObjectKeyOffsets.put(objKey, 
-stmObjNumber);
+                            xrefOffset.put(objKey, -stmObjNumber);
                         }
                     }
-                    catch (IOException exception)
+                    catch (NumberFormatException exception)
                     {
-                        // save the EOF marker as the following data is most 
likely some garbage
-                        LOG.debug("An exception occured during brute force for 
last EOF - ignoring", exception);
-                        lastEOFMarker = tempMarker;
+                        LOG.debug("Skipped corrupt object key in stream: " + 
stmObjNumber);
                     }
                 }
-                source.read();
-            }
-            source.seek(originOffset);
-            // no EOF marker found
-            if (lastEOFMarker == null)
-            {
-                lastEOFMarker = Long.MAX_VALUE;
             }
         }
+        // restore origin offset
+        source.seek(originOffset);
     }
 
     /**
-     * Brute force search for all object streams.
+     * Search for all offsets of object streams within the given pdf
      * 
+     * @return a map of all offsets for object streams
      * @throws IOException if something went wrong
      */
-    private void bfSearchForObjStreams() throws IOException
+    private Map<Long, COSObjectKey> bfSearchForObjStreamOffsets() throws 
IOException
     {
         HashMap<Long, COSObjectKey> bfSearchObjStreamsOffsets = new 
HashMap<>();
-        long originOffset = source.getPosition();
         source.seek(MINIMUM_SEARCH_OFFSET);
         char[] string = " obj".toCharArray();
         while (!source.isEOF())
         {
-            // search for EOF marker
+            // search for object stream marker
             if (isString(OBJ_STREAM))
             {
                 long currentPosition = source.getPosition();
@@ -1715,8 +1824,8 @@ public class COSParser extends BaseParse
                                             newOffset = source.getPosition();
                                             long objNumber = 
readObjectNumber();
                                             int genNumber = 
readGenerationNumber();
-                                            COSObjectKey streamObjectKey = new 
COSObjectKey(objNumber,
-                                                    genNumber);
+                                            COSObjectKey streamObjectKey = new 
COSObjectKey(
+                                                    objNumber, genNumber);
                                             
bfSearchObjStreamsOffsets.put(newOffset,
                                                     streamObjectKey);
                                         }
@@ -1738,116 +1847,8 @@ public class COSParser extends BaseParse
             }
             source.read();
         }
-        // add all found compressed objects to the brute force search result
-        for (Entry<Long, COSObjectKey> streamOffsetsEntry : 
bfSearchObjStreamsOffsets.entrySet())
-        {
-            Long offset = streamOffsetsEntry.getKey();
-            Long bfOffset = 
bfSearchCOSObjectKeyOffsets.get(streamOffsetsEntry.getValue());
-            // incomplete object stream found?
-            if (bfOffset == null)
-            {
-                LOG.warn("Skipped incomplete object stream:" + 
streamOffsetsEntry.getValue()
-                        + " at " + offset);
-                continue;
-            }
-            // check if the object was overwritten
-            if (offset.equals(bfOffset))
-            {
-                source.seek(offset);
-                long stmObjNumber = readObjectNumber();
-                int stmGenNumber = readGenerationNumber();
-                readExpectedString(OBJ_MARKER, true);
-                int nrOfObjects = 0;
-                byte[] numbersBytes = null;
-                COSStream stream = null;
-                COSInputStream is = null;
-                try
-                {
-                    COSDictionary dict = parseCOSDictionary();
-                    int offsetFirstStream = dict.getInt(COSName.FIRST);
-                    nrOfObjects = dict.getInt(COSName.N);
-                    // skip the stream if required values are missing
-                    if (offsetFirstStream == -1 || nrOfObjects == -1)
-                    {
-                        continue;
-                    }
-                    stream = parseCOSStream(dict);
-                    if (securityHandler != null)
-                    {
-                        securityHandler.decryptStream(stream, stmObjNumber, 
stmGenNumber);
-                    }
-                    is = stream.createInputStream();
-                    numbersBytes = new byte[offsetFirstStream];
-                    long isResult = is.read(numbersBytes);
-
-                    if (Long.compare(isResult, numbersBytes.length) != 0)
-                    {
-                        LOG.debug("Tried reading " + numbersBytes.length + " 
bytes but only " + isResult + " bytes read");
-                    }
-                }
-                catch (IOException exception)
-                {
-                    LOG.debug(
-                            "Skipped corrupt stream: (" + stmObjNumber + " 0 
at offset " + offset, exception);
-                    continue;
-                }
-                finally
-                {
-                    if (is != null)
-                    {
-                        is.close();
-                    }
-                    if (stream != null)
-                    {
-                        stream.close();
-                    }
-                }
-                int start = 0;
-                // skip spaces
-                while (start < numbersBytes.length && numbersBytes[start] == 
32)
-                {
-                    start++;
-                }
-                String numbersStr = new String(numbersBytes, start, 
numbersBytes.length - start,
-                        StandardCharsets.ISO_8859_1);
-                numbersStr = numbersStr.replace('\n', ' ').replace("  ", " ");
-                String[] numbers = numbersStr.split(" ");
-                if (numbers.length < nrOfObjects * 2)
-                {
-                    LOG.debug(
-                            "Skipped corrupt stream: (" + stmObjNumber + " 0 
at offset " + offset);
-                    continue;
-                }
-                Map<COSObjectKey, Long> xrefOffset = 
xrefTrailerResolver.getXrefTable();
-                for (int i = 0; i < nrOfObjects; i++)
-                {
-                    try
-                    {
-                        long objNumber = Long.parseLong(numbers[i * 2]);
-                        COSObjectKey objKey = new COSObjectKey(objNumber, 0);
-                        Long existingOffset = 
bfSearchCOSObjectKeyOffsets.get(objKey);
-                        if (existingOffset != null && existingOffset < 0)
-                        {
-                            // translate stream object key to its offset
-                            COSObjectKey objStmKey = new 
COSObjectKey(Math.abs(existingOffset), 0);
-                            existingOffset = 
bfSearchCOSObjectKeyOffsets.get(objStmKey);
-                        }
-                        if (existingOffset == null || offset > existingOffset)
-                        {
-                            bfSearchCOSObjectKeyOffsets.put(objKey, 
-stmObjNumber);
-                            xrefOffset.put(objKey, -stmObjNumber);
-                        }
-                    }
-                    catch (NumberFormatException exception)
-                    {
-                        LOG.debug("Skipped corrupt object key in stream: " + 
stmObjNumber);
-                    }
-                }
-            }
-        }
-        source.seek(originOffset);
+        return bfSearchObjStreamsOffsets;
     }
-
     /**
      * Brute force search for all xref entries (tables).
      * 
@@ -2280,7 +2281,7 @@ public class COSParser extends BaseParse
     private boolean parseTrailer() throws IOException
     {
         // parse the last trailer.
-        trailerOffset = source.getPosition();
+        long trailerOffset = source.getPosition();
         // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
         if (isLenient)
         {


Reply via email to