Author: msahyoun
Date: Mon Aug 15 19:31:58 2016
New Revision: 1756416

URL: http://svn.apache.org/viewvc?rev=1756416&view=rev
Log:
PDFBOX-3461: add handling for empty paragraphs; enhance unit tests; compare streams

Added: pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/interactive/form/PlainTextTest.java - copied unchanged from r1756411, pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/interactive/form/PlainTextTest.java Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/interactive/form/PlainText.java pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/interactive/form/ControlCharacterTest.java Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/interactive/form/PlainText.java URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/interactive/form/PlainText.java?rev=1756416&r1=1756415&r2=1756416&view=diff ============================================================================== --- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/interactive/form/PlainText.java (original) +++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/interactive/form/PlainText.java Mon Aug 15 19:31:58 2016 @@ -55,6 +55,11 @@ class PlainText paragraphs = new ArrayList<Paragraph>(); for (String part : parts) { + // Acrobat prints a space for an empty paragraph + if (part.length() == 0) + { + part = " "; + } paragraphs.add(new Paragraph(part)); } } Modified: pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/interactive/form/ControlCharacterTest.java URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/interactive/form/ControlCharacterTest.java?rev=1756416&r1=1756415&r2=1756416&view=diff ============================================================================== --- pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/interactive/form/ControlCharacterTest.java (original) +++ pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdmodel/interactive/form/ControlCharacterTest.java Mon Aug 15 19:31:58 2016 @@ -16,10 +16,22 @@ */ package 
org.apache.pdfbox.pdmodel.interactive.form; +import static org.junit.Assert.assertEquals; + import java.io.File; import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Deque; +import java.util.List; +import org.apache.pdfbox.cos.COSString; +import org.apache.pdfbox.pdfparser.PDFStreamParser; import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAppearanceStream; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -78,45 +90,85 @@ public class ControlCharacterTest { @Test public void characterSPACE() throws IOException { - acroForm.getField("pdfbox-space").setValue("SPACE SPACE"); + PDField field = acroForm.getField("pdfbox-space"); + field.setValue("SPACE SPACE"); + + List<String> pdfboxValues = getStringsFromStream(field); + List<String> acrobatValues = getStringsFromStream(acroForm.getField("acrobat-space")); + + assertEquals(pdfboxValues, acrobatValues); } @Test public void characterCR() throws IOException { - acroForm.getField("pdfbox-cr").setValue("CR\rCR"); + PDField field = acroForm.getField("pdfbox-cr"); + field.setValue("CR\rCR"); + + List<String> pdfboxValues = getStringsFromStream(field); + List<String> acrobatValues = getStringsFromStream(acroForm.getField("acrobat-cr")); + + assertEquals(pdfboxValues, acrobatValues); } @Test public void characterLF() throws IOException { - acroForm.getField("pdfbox-lf").setValue("LF\nLF"); + PDField field = acroForm.getField("pdfbox-lf"); + field.setValue("LF\nLF"); + + List<String> pdfboxValues = getStringsFromStream(field); + List<String> acrobatValues = getStringsFromStream(acroForm.getField("acrobat-lf")); + + assertEquals(pdfboxValues, acrobatValues); } @Test public void characterCRLF() throws IOException { - 
acroForm.getField("pdfbox-crlf").setValue("CRLF\r\nCRLF"); + PDField field = acroForm.getField("pdfbox-crlf"); + field.setValue("CRLF\r\nCRLF"); + + List<String> pdfboxValues = getStringsFromStream(field); + List<String> acrobatValues = getStringsFromStream(acroForm.getField("acrobat-crlf")); + + assertEquals(pdfboxValues, acrobatValues); } @Test public void characterLFCR() throws IOException { - acroForm.getField("pdfbox-lfcr").setValue("LFCR\r\nLFCR"); + PDField field = acroForm.getField("pdfbox-lfcr"); + field.setValue("LFCR\n\rLFCR"); + + List<String> pdfboxValues = getStringsFromStream(field); + List<String> acrobatValues = getStringsFromStream(acroForm.getField("acrobat-lfcr")); + + assertEquals(pdfboxValues, acrobatValues); } @Test public void characterUnicodeLinebreak() throws IOException { - acroForm.getField("pdfbox-linebreak").setValue("linebreak\u2028linebreak"); + PDField field = acroForm.getField("pdfbox-linebreak"); + field.setValue("linebreak\u2028linebreak"); + List<String> pdfboxValues = getStringsFromStream(field); + List<String> acrobatValues = getStringsFromStream(acroForm.getField("acrobat-linebreak")); + + assertEquals(pdfboxValues, acrobatValues); } @Test public void characterUnicodeParagraphbreak() throws IOException { - acroForm.getField("pdfbox-paragraphbreak").setValue("paragraphbreak\u2029paragraphbreak"); + PDField field = acroForm.getField("pdfbox-paragraphbreak"); + field.setValue("paragraphbreak\u2029paragraphbreak"); + List<String> pdfboxValues = getStringsFromStream(field); + List<String> acrobatValues = getStringsFromStream(acroForm.getField("acrobat-paragraphbreak")); + + assertEquals(pdfboxValues, acrobatValues); } @After @@ -124,4 +176,27 @@ public class ControlCharacterTest { { document.close(); } + + private List<String> getStringsFromStream(PDField field) throws IOException + { + PDAnnotationWidget widget = field.getWidgets().get(0); + PDFStreamParser parser = new PDFStreamParser(widget.getNormalAppearanceStream()); + + 
Object token = parser.parseNextToken(); + + List<String> stringValues = new ArrayList<String>(); + + while (token != null) + { + if (token instanceof COSString) + { + // TODO: improve the string output to better match + // trimming as Acrobat adds spaces to strings + // where we don't + stringValues.add(((COSString) token).getString().trim()); + } + token = parser.parseNextToken(); + } + return stringValues; + } }