Author: lehmi
Date: Sat Oct 13 13:55:30 2012
New Revision: 1397829
URL: http://svn.apache.org/viewvc?rev=1397829&view=rev
Log:
PDFBOX-1130: improved paragraph end tag handling as proposed by Mike McCandless
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1397829&r1=1397828&r2=1397829&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Sat Oct 13 13:55:30 2012
@@ -183,6 +183,13 @@ public class PDFTextStripper extends PDF
private TextNormalize normalize = null;
/**
+ * True if we started a paragraph but haven't ended it
+ * yet.
+ */
+ private boolean inParagraph;
+
+
+ /**
* Instantiate a new PDFTextStripper object. This object will load
* properties from PDFTextStripper.properties and will not do
* anything special to convert the text to a more encoding-specific
@@ -1724,7 +1731,13 @@ public class PDFTextStripper extends PDF
*/
protected void writeParagraphStart() throws IOException
{
+ if (inParagraph)
+ {
+ writeParagraphEnd();
+ inParagraph = false;
+ }
output.write(getParagraphStart());
+ inParagraph = true;
}
/**
@@ -1734,6 +1747,7 @@ public class PDFTextStripper extends PDF
protected void writeParagraphEnd() throws IOException
{
output.write(getParagraphEnd());
+ inParagraph = false;
}
/**