Author: tilman
Date: Tue Sep 19 19:05:48 2017
New Revision: 1808931
URL: http://svn.apache.org/viewvc?rev=1808931&view=rev
Log:
PDFBOX-3933: don't swallow CR at end of stream unless there is one at the
beginning; add testcase from issue 2079
Added:
pdfbox/branches/1.8/pdfbox/src/test/resources/org/apache/pdfbox/pdfparser/embedded_zip.pdf
(with props)
Modified:
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/EndstreamOutputStreamTest.java
Modified:
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL:
http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1808931&r1=1808930&r2=1808931&view=diff
==============================================================================
---
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
(original)
+++
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
Tue Sep 19 19:05:48 2017
@@ -419,6 +419,9 @@ public abstract class BaseParser
{
throw new IOException("expected='stream' actual='" +
streamString + "' at offset " + pdfSource.getOffset());
}
+ // Flag indicating whether a CR character was encountered after the "stream" keyword, in which case we expect it again before "endstream"
+ boolean hasCR = false;
+
//PDF Ref 3.2.7 A stream must be followed by either
//a CRLF or LF but nothing else.
@@ -435,6 +438,7 @@ public abstract class BaseParser
if( whitespace == 0x0D )
{
+ hasCR = true;
whitespace = pdfSource.read();
if( whitespace != 0x0A )
{
@@ -487,7 +491,7 @@ public abstract class BaseParser
{
// Couldn't determine length from dict: just
// scan until we find endstream:
- readUntilEndStream(new EndstreamOutputStream(out));
+ readUntilEndStream(new EndstreamOutputStream(out, hasCR));
}
else
{
@@ -564,7 +568,7 @@ public abstract class BaseParser
IOUtils.closeQuietly(out);
out = stream.createFilteredStream( streamLength );
// scan until we find endstream:
- readUntilEndStream(new EndstreamOutputStream(out));
+ readUntilEndStream(new EndstreamOutputStream(out, hasCR));
}
}
}
@@ -602,7 +606,7 @@ public abstract class BaseParser
* If for some reason we get something else here, Read
until we find the next
* "endstream"
*/
- readUntilEndStream(new EndstreamOutputStream(out));
+ readUntilEndStream(new EndstreamOutputStream(out, hasCR));
endStream = readString();
if( !endStream.equals( ENDSTREAM_STRING ) )
{
Modified:
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java
URL:
http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java?rev=1808931&r1=1808930&r2=1808931&view=diff
==============================================================================
---
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java
(original)
+++
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/EndstreamOutputStream.java
Tue Sep 19 19:05:48 2017
@@ -39,9 +39,13 @@ class EndstreamOutputStream extends Buff
private int pos = 0;
private boolean mustFilter = true;
- public EndstreamOutputStream(OutputStream out)
+ // Flag indicating whether we search for CR+LF or for LF only
+ private final boolean searchCR;
+
+ public EndstreamOutputStream(OutputStream out, boolean searchCR)
{
super(out);
+ this.searchCR = searchCR;
}
/**
@@ -96,7 +100,7 @@ class EndstreamOutputStream extends Buff
// don't write CR, LF, or CR LF if at the end of the buffer
if (len > 0)
{
- if (b[off + len - 1] == '\r')
+ if (searchCR && b[off + len - 1] == '\r')
{
hasCR = true;
--len;
@@ -105,7 +109,7 @@ class EndstreamOutputStream extends Buff
{
hasLF = true;
--len;
- if (len > 0 && b[off + len - 1] == '\r')
+ if (searchCR && len > 0 && b[off + len - 1] == '\r')
{
hasCR = true;
--len;
Modified:
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL:
http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1808931&r1=1808930&r2=1808931&view=diff
==============================================================================
---
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
(original)
+++
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
Tue Sep 19 19:05:48 2017
@@ -1748,8 +1748,11 @@ public class NonSequentialPDFParser exte
whitespace = pdfSource.read();
}
+ boolean hasCR = false;
+
if (whitespace == 0x0D)
{
+ hasCR = true;
whitespace = pdfSource.read();
if (whitespace != 0x0A)
{
@@ -1811,7 +1814,7 @@ public class NonSequentialPDFParser exte
if (useReadUntilEnd)
{
out = stream.createFilteredStream();
- readUntilEndStream(new EndstreamOutputStream(out));
+ readUntilEndStream(new EndstreamOutputStream(out, hasCR));
}
String endStream = readString();
if (endStream.equals("endobj") && isLenient)
Modified:
pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/EndstreamOutputStreamTest.java
URL:
http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/EndstreamOutputStreamTest.java?rev=1808931&r1=1808930&r2=1808931&view=diff
==============================================================================
---
pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/EndstreamOutputStreamTest.java
(original)
+++
pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/EndstreamOutputStreamTest.java
Tue Sep 19 19:05:48 2017
@@ -17,8 +17,21 @@
package org.apache.pdfbox.pdfparser;
import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Map;
import junit.framework.TestCase;
+import org.apache.pdfbox.io.IOUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.common.COSObjectable;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.junit.Assert;
/**
@@ -30,7 +43,7 @@ public class EndstreamOutputStreamTest e
public void testEndstreamOutputStream() throws IOException
{
ByteArrayOutputStream baos = new ByteArrayOutputStream();
- EndstreamOutputStream feos = new EndstreamOutputStream(baos);
+ EndstreamOutputStream feos = new EndstreamOutputStream(baos, true);
byte tab1[] = {1, 2, 3, 4};
byte tab2[] = {5, 6, 7, '\r', '\n'};
byte tab3[] = {8, 9, '\r', '\n'};
@@ -42,7 +55,7 @@ public class EndstreamOutputStreamTest e
Assert.assertArrayEquals(expectedResult1, baos.toByteArray());
baos = new ByteArrayOutputStream();
- feos = new EndstreamOutputStream(baos);
+ feos = new EndstreamOutputStream(baos, true);
byte tab4[] = {1, 2, 3, 4};
byte tab5[] = {5, 6, 7, '\r' };
byte tab6[] = {8, 9, '\n'};
@@ -54,7 +67,7 @@ public class EndstreamOutputStreamTest e
Assert.assertArrayEquals(expectedResult2, baos.toByteArray());
baos = new ByteArrayOutputStream();
- feos = new EndstreamOutputStream(baos);
+ feos = new EndstreamOutputStream(baos, true);
byte tab7[] = {1, 2, 3, 4, '\r'};
byte tab8[] = {'\n', 5, 6, 7, '\n' };
byte tab9[] = {8, 9, '\r'}; // final CR is not to be discarded
@@ -66,7 +79,7 @@ public class EndstreamOutputStreamTest e
Assert.assertArrayEquals(expectedResult3, baos.toByteArray());
baos = new ByteArrayOutputStream();
- feos = new EndstreamOutputStream(baos);
+ feos = new EndstreamOutputStream(baos, true);
byte tab10[] = {1, 2, 3, 4, '\r'};
byte tab11[] = {'\n', 5, 6, 7, '\r' };
byte tab12[] = {8, 9, '\r'};
@@ -80,7 +93,7 @@ public class EndstreamOutputStreamTest e
Assert.assertArrayEquals(expectedResult4, baos.toByteArray());
baos = new ByteArrayOutputStream();
- feos = new EndstreamOutputStream(baos);
+ feos = new EndstreamOutputStream(baos, true);
byte tab14[] = {1, 2, 3, 4, '\r'};
byte tab15[] = {'\n', 5, 6, 7, '\r' };
byte tab16[] = {8, 9, '\n'};
@@ -92,6 +105,38 @@ public class EndstreamOutputStreamTest e
feos.flush();
byte expectedResult5[] = { 1, 2, 3, 4, '\r', '\n', 5, 6, 7, '\r', 8, 9, '\n', '\r'};
Assert.assertArrayEquals(expectedResult5, baos.toByteArray());
+
+ baos = new ByteArrayOutputStream();
+ feos = new EndstreamOutputStream(baos, false);
+ feos.write(tab10, 0, tab14.length);
+ feos.write(tab11, 0, tab15.length);
+ feos.write(tab12, 0, tab16.length);
+ feos.write(tab13, 0, tab17.length);
+ feos.flush();
+ // CR is not to be discarded when passing false for searchCR in the EndstreamOutputStream constructor
+ byte expectedResult6[] = { 1, 2, 3, 4, '\r', '\n', 5, 6, 7, '\r', 8, 9, '\r'};
+ Assert.assertArrayEquals(expectedResult6, baos.toByteArray());
+ }
+
+ public void testPDFBox2079EmbeddedFile() throws IOException
+ {
+ // there should be 17660 bytes in the zip file.
+ // in PDFBox 1.8.5, windows newline is appended to the byte stream
+ // yielding 17662 bytes, which causes a problem for ZipFile in Java 1.6
+ PDDocument doc = PDDocument.load(new File("src/test/resources/org/apache/pdfbox/pdfparser", "embedded_zip.pdf"));
+ PDDocumentCatalog catalog = doc.getDocumentCatalog();
+ PDDocumentNameDictionary names = catalog.getNames();
+ PDEmbeddedFilesNameTreeNode node = names.getEmbeddedFiles();
+ Map<String, COSObjectable> map = node.getNames();
+ Assert.assertEquals(1, map.size());
+ PDComplexFileSpecification spec = (PDComplexFileSpecification) map.get("My first attachment");
+ PDEmbeddedFile file = spec.getEmbeddedFile();
+ InputStream input = file.createInputStream();
+ File f = new File("target/test-output", spec.getFile());
+ OutputStream os = new FileOutputStream(f);
+ IOUtils.copy(input, os);
+ os.close();
+ Assert.assertEquals(17660, f.length());
+ doc.close();
}
-
}
Added:
pdfbox/branches/1.8/pdfbox/src/test/resources/org/apache/pdfbox/pdfparser/embedded_zip.pdf
URL:
http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/test/resources/org/apache/pdfbox/pdfparser/embedded_zip.pdf?rev=1808931&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
pdfbox/branches/1.8/pdfbox/src/test/resources/org/apache/pdfbox/pdfparser/embedded_zip.pdf
------------------------------------------------------------------------------
svn:mime-type = application/pdf