Author: tilman
Date: Tue Sep 19 19:05:48 2017
New Revision: 1808931

URL: http://svn.apache.org/viewvc?rev=1808931&view=rev
Log:
PDFBOX-3933: swallow CR at end of stream only if there was one after the 
"stream" keyword; add testcase from issue 2079

Added:
    
pdfbox/branches/1.8/pdfbox/src/test/resources/org/apache/pdfbox/pdfparser/embedded_zip.pdf
   (with props)
Modified:
    
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
    
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java
    
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
    
pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/EndstreamOutputStreamTest.java

Modified: 
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1808931&r1=1808930&r2=1808931&view=diff
==============================================================================
--- 
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
 (original)
+++ 
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
 Tue Sep 19 19:05:48 2017
@@ -419,6 +419,9 @@ public abstract class BaseParser
             {
                 throw new IOException("expected='stream' actual='" + 
streamString + "' at offset " + pdfSource.getOffset());
             }
+            // Flag indicating whether a CR character was encountered after 
the "stream" keyword, in which case we expect it again before "endstream" 
+            boolean hasCR = false;
+
 
             //PDF Ref 3.2.7 A stream must be followed by either
             //a CRLF or LF but nothing else.
@@ -435,6 +438,7 @@ public abstract class BaseParser
 
             if( whitespace == 0x0D )
             {
+                hasCR = true;
                 whitespace = pdfSource.read();
                 if( whitespace != 0x0A )
                 {
@@ -487,7 +491,7 @@ public abstract class BaseParser
             {
                 // Couldn't determine length from dict: just
                 // scan until we find endstream:
-                readUntilEndStream(new EndstreamOutputStream(out));
+                readUntilEndStream(new EndstreamOutputStream(out, hasCR));
             }
             else
             {
@@ -564,7 +568,7 @@ public abstract class BaseParser
                         IOUtils.closeQuietly(out);
                         out = stream.createFilteredStream( streamLength );
                         // scan until we find endstream:
-                        readUntilEndStream(new EndstreamOutputStream(out));
+                        readUntilEndStream(new EndstreamOutputStream(out, 
hasCR));
                     }
                 }
             }
@@ -602,7 +606,7 @@ public abstract class BaseParser
                      * If for some reason we get something else here, Read 
until we find the next
                      * "endstream"
                      */
-                    readUntilEndStream(new EndstreamOutputStream(out));
+                    readUntilEndStream(new EndstreamOutputStream(out, hasCR));
                     endStream = readString();
                     if( !endStream.equals( ENDSTREAM_STRING ) )
                     {

Modified: 
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java?rev=1808931&r1=1808930&r2=1808931&view=diff
==============================================================================
--- 
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java
 (original)
+++ 
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java
 Tue Sep 19 19:05:48 2017
@@ -39,9 +39,13 @@ class EndstreamOutputStream extends Buff
     private int pos = 0;
     private boolean mustFilter = true;
 
-    public EndstreamOutputStream(OutputStream out)
+    // Flag indicating whether we search for CR+LF or for LF only
+    private final boolean searchCR;
+
+    public EndstreamOutputStream(OutputStream out, boolean searchCR)
     {
         super(out);
+        this.searchCR = searchCR;
     }
 
     /**
@@ -96,7 +100,7 @@ class EndstreamOutputStream extends Buff
             // don't write CR, LF, or CR LF if at the end of the buffer
             if (len > 0)
             {
-                if (b[off + len - 1] == '\r')
+                if (searchCR && b[off + len - 1] == '\r')
                 {
                     hasCR = true;
                     --len;
@@ -105,7 +109,7 @@ class EndstreamOutputStream extends Buff
                 {
                     hasLF = true;
                     --len;
-                    if (len > 0 && b[off + len - 1] == '\r')
+                    if (searchCR && len > 0 && b[off + len - 1] == '\r')
                     {
                         hasCR = true;
                         --len;

Modified: 
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1808931&r1=1808930&r2=1808931&view=diff
==============================================================================
--- 
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
 (original)
+++ 
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
 Tue Sep 19 19:05:48 2017
@@ -1748,8 +1748,11 @@ public class NonSequentialPDFParser exte
                 whitespace = pdfSource.read();
             }
 
+            boolean hasCR = false;
+
             if (whitespace == 0x0D)
             {
+                hasCR = true;
                 whitespace = pdfSource.read();
                 if (whitespace != 0x0A)
                 {
@@ -1811,7 +1814,7 @@ public class NonSequentialPDFParser exte
             if (useReadUntilEnd)
             {
                 out = stream.createFilteredStream();
-                readUntilEndStream(new EndstreamOutputStream(out));
+                readUntilEndStream(new EndstreamOutputStream(out, hasCR));
             }
             String endStream = readString();
             if (endStream.equals("endobj") && isLenient)

Modified: 
pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/EndstreamOutputStreamTest.java
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/EndstreamOutputStreamTest.java?rev=1808931&r1=1808930&r2=1808931&view=diff
==============================================================================
--- 
pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/EndstreamOutputStreamTest.java
 (original)
+++ 
pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/EndstreamOutputStreamTest.java
 Tue Sep 19 19:05:48 2017
@@ -17,8 +17,21 @@
 package org.apache.pdfbox.pdfparser;
 
 import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Map;
 import junit.framework.TestCase;
+import org.apache.pdfbox.io.IOUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.common.COSObjectable;
+import 
org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
 import org.junit.Assert;
 
 /**
@@ -30,7 +43,7 @@ public class EndstreamOutputStreamTest e
     public void testEndstreamOutputStream() throws IOException
     {
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        EndstreamOutputStream feos = new EndstreamOutputStream(baos);
+        EndstreamOutputStream feos = new EndstreamOutputStream(baos, true);
         byte tab1[] = {1, 2, 3, 4};
         byte tab2[] = {5, 6, 7, '\r', '\n'};
         byte tab3[] = {8, 9, '\r', '\n'};
@@ -42,7 +55,7 @@ public class EndstreamOutputStreamTest e
         Assert.assertArrayEquals(expectedResult1, baos.toByteArray());
 
         baos = new ByteArrayOutputStream();
-        feos = new EndstreamOutputStream(baos);
+        feos = new EndstreamOutputStream(baos, true);
         byte tab4[] = {1, 2, 3, 4};
         byte tab5[] = {5, 6, 7, '\r' };
         byte tab6[] = {8, 9, '\n'};
@@ -54,7 +67,7 @@ public class EndstreamOutputStreamTest e
         Assert.assertArrayEquals(expectedResult2, baos.toByteArray());
         
         baos = new ByteArrayOutputStream();
-        feos = new EndstreamOutputStream(baos);
+        feos = new EndstreamOutputStream(baos, true);
         byte tab7[] = {1, 2, 3, 4, '\r'};
         byte tab8[] = {'\n', 5, 6, 7, '\n' };
         byte tab9[] = {8, 9, '\r'}; // final CR is not to be discarded
@@ -66,7 +79,7 @@ public class EndstreamOutputStreamTest e
         Assert.assertArrayEquals(expectedResult3, baos.toByteArray());
         
         baos = new ByteArrayOutputStream();
-        feos = new EndstreamOutputStream(baos);
+        feos = new EndstreamOutputStream(baos, true);
         byte tab10[] = {1, 2, 3, 4, '\r'};
         byte tab11[] = {'\n', 5, 6, 7, '\r' };
         byte tab12[] = {8, 9, '\r'};
@@ -80,7 +93,7 @@ public class EndstreamOutputStreamTest e
         Assert.assertArrayEquals(expectedResult4, baos.toByteArray());
 
         baos = new ByteArrayOutputStream();
-        feos = new EndstreamOutputStream(baos);
+        feos = new EndstreamOutputStream(baos, true);
         byte tab14[] = {1, 2, 3, 4, '\r'};
         byte tab15[] = {'\n', 5, 6, 7, '\r' };
         byte tab16[] = {8, 9, '\n'};
@@ -92,6 +105,38 @@ public class EndstreamOutputStreamTest e
         feos.flush();
         byte expectedResult5[] = { 1, 2, 3, 4, '\r', '\n', 5, 6, 7, '\r', 8, 
9, '\n', '\r'};
         Assert.assertArrayEquals(expectedResult5, baos.toByteArray());
+
+        baos = new ByteArrayOutputStream();
+        feos = new EndstreamOutputStream(baos, false);
+        feos.write(tab10, 0, tab14.length);
+        feos.write(tab11, 0, tab15.length);
+        feos.write(tab12, 0, tab16.length);
+        feos.write(tab13, 0, tab17.length);
+        feos.flush();
+        // CR is not to be discarded when passing false for searchCR in the 
EndstreamOutputStream constructor 
+        byte expectedResult6[] = {  1, 2, 3, 4, '\r', '\n', 5, 6, 7, '\r', 8, 
9, '\r'};
+        Assert.assertArrayEquals(expectedResult6, baos.toByteArray());
+    }
+
+    public void testPDFBox2079EmbeddedFile() throws IOException
+    {
+        // there should be 17660 bytes in the zip file.
+        // in PDFBox 1.8.5, windows newline is appended to the byte stream
+        // yielding 17662 bytes, which causes a problem for ZipFile in Java 1.6
+        PDDocument doc = PDDocument.load(new 
File("src/test/resources/org/apache/pdfbox/pdfparser", "embedded_zip.pdf"));
+        PDDocumentCatalog catalog = doc.getDocumentCatalog();
+        PDDocumentNameDictionary names = catalog.getNames();
+        PDEmbeddedFilesNameTreeNode node = names.getEmbeddedFiles();
+        Map<String, COSObjectable> map = node.getNames();
+        Assert.assertEquals(1, map.size());
+        PDComplexFileSpecification spec = (PDComplexFileSpecification) 
map.get("My first attachment");
+        PDEmbeddedFile file = spec.getEmbeddedFile();
+        InputStream input = file.createInputStream();
+        File f = new File("target/test-output", spec.getFile());
+        OutputStream os = new FileOutputStream(f);
+        IOUtils.copy(input, os);
+        os.close();
+        Assert.assertEquals(17660, f.length());
+        doc.close();
     }
-    
 }

Added: 
pdfbox/branches/1.8/pdfbox/src/test/resources/org/apache/pdfbox/pdfparser/embedded_zip.pdf
URL: 
http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/test/resources/org/apache/pdfbox/pdfparser/embedded_zip.pdf?rev=1808931&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
pdfbox/branches/1.8/pdfbox/src/test/resources/org/apache/pdfbox/pdfparser/embedded_zip.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/pdf


Reply via email to