Author: lehmi
Date: Sat Oct 13 13:55:30 2012
New Revision: 1397829

URL: http://svn.apache.org/viewvc?rev=1397829&view=rev
Log:
PDFBOX-1130: improved paragraph end tag handling as proposed by Mike McCandless

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1397829&r1=1397828&r2=1397829&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Sat Oct 13 13:55:30 2012
@@ -183,6 +183,13 @@ public class PDFTextStripper extends PDF
     private TextNormalize normalize = null;
 
     /**
+     * True if we started a paragraph but haven't ended it
+     * yet.
+     */
+    private boolean inParagraph;
+
+
+    /**
      * Instantiate a new PDFTextStripper object. This object will load
      * properties from PDFTextStripper.properties and will not do
      * anything special to convert the text to a more encoding-specific
@@ -1724,7 +1731,13 @@ public class PDFTextStripper extends PDF
      */
     protected void writeParagraphStart() throws IOException
     {
+        if (inParagraph) 
+        {
+            writeParagraphEnd();
+            inParagraph = false;
+        }
         output.write(getParagraphStart());
+        inParagraph = true;
     }
 
     /**
@@ -1734,6 +1747,7 @@ public class PDFTextStripper extends PDF
     protected void writeParagraphEnd() throws IOException
     {
         output.write(getParagraphEnd());
+        inParagraph = false;
     }
 
     /**


Reply via email to