Author: jahewson
Date: Wed Jun 18 19:53:38 2014
New Revision: 1603606
URL: http://svn.apache.org/r1603606
Log:
PDFBOX-2145: Clean up TextPosition
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java?rev=1603606&r1=1603605&r2=1603606&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java Wed Jun 18 19:53:38 2014
@@ -16,7 +16,6 @@
*/
package org.apache.pdfbox.text;
-import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.util.Matrix;
@@ -25,89 +24,33 @@ import org.apache.pdfbox.util.Matrix;
*
* @author Ben Litchfield
*/
-public class TextPosition
+public final class TextPosition
{
// text matrix for the start of the text object, coordinates are in display units
// and have not been adjusted
- private Matrix textPos;
+ private final Matrix textPos;
// ending X and Y coordinates in display units
- private float endX;
- private float endY;
+ private final float endX;
+ private final float endY;
- private float maxTextHeight; // maximum height of text, in display units
- private int rot; // 0, 90, 180, 270 degrees of page rotation
- private float x = Float.NEGATIVE_INFINITY;
- private float y = Float.NEGATIVE_INFINITY;
- private float pageHeight;
- private float pageWidth;
+ private final float maxTextHeight; // maximum height of text, in display units
+ private final int rotation; // 0, 90, 180, 270 degrees of page rotation
+ private final float x = Float.NEGATIVE_INFINITY;
+ private final float y = Float.NEGATIVE_INFINITY;
+ private final float pageHeight;
+ private final float pageWidth;
+
+ private final float widthOfSpace; // width of a space, in display units
+
+ private final int[] unicodeCP;
+ private final PDFont font;
+ private final float fontSize;
+ private final int fontSizePt;
+
+ // mutable
private float[] widths;
- private float widthOfSpace; // width of a space, in display units
private String string;
- private int[] unicodeCP;
- private PDFont font;
- private float fontSize;
- private int fontSizePt;
-
- /**
- * Constructor.
- *
- * @deprecated Can this be removed?
- */
- @Deprecated
- protected TextPosition()
- {
- }
-
- /**
- * Constructor.
- *
- * @param page Page that the text is located in
- * @param textPositionSt TextMatrix for start of text (in display units)
- * @param textPositionEnd TextMatrix for end of text (in display units)
- * @param maxFontH Maximum height of text (in display units)
- * @param individualWidths The width of each individual character. (in ?
units)
- * @param spaceWidth The width of the space character. (in display units)
- * @param string The character to be displayed.
- * @param currentFont The current for for this text position.
- * @param fontSizeValue The new font size.
- * @param fontSizeInPt The font size in pt units.
- * @param ws The word spacing parameter (in display units)
- *
- * @deprecated Can this be removed?
- */
- @Deprecated
- public TextPosition(PDPage page, Matrix textPositionSt, Matrix
textPositionEnd, float maxFontH,
- float[] individualWidths, float spaceWidth, String
string,
- PDFont currentFont, float fontSizeValue, int
fontSizeInPt, float ws)
- {
- this.textPos = textPositionSt;
-
- this.endX = textPositionEnd.getXPosition();
- this.endY = textPositionEnd.getYPosition();
-
- this.rot = page.findRotation();
- // make sure it is 0 to 270 and no negative numbers
- if (this.rot < 0)
- {
- rot += 360;
- }
- else if (rot >= 360)
- {
- rot -= 360;
- }
-
- this.maxTextHeight = maxFontH;
- this.pageHeight = page.findMediaBox().getHeight();
- this.pageWidth = page.findMediaBox().getWidth();
-
- this.widths = individualWidths;
- this.widthOfSpace = spaceWidth;
- this.string = string;
- this.font = currentFont;
- this.fontSize = fontSizeValue;
- this.fontSizePt = fontSizeInPt;
- }
/**
* Constructor.
@@ -137,16 +80,17 @@ public class TextPosition
this.endX = endXValue;
this.endY = endYValue;
- this.rot = pageRotation;
+ int rotation = pageRotation;
// make sure it is 0 to 270 and no negative numbers
- if (this.rot < 0)
+ if (rotation < 0)
{
- rot += 360;
+ rotation += 360;
}
- else if (rot >= 360)
+ else if (rotation >= 360)
{
- rot -= 360;
+ rotation -= 360;
}
+ this.rotation = rotation;
this.maxTextHeight = maxFontH;
this.pageHeight = pageHeightValue;
@@ -176,7 +120,7 @@ public class TextPosition
*
* @return an array containing all codepoints.
*/
- public int[] getCodePoints()
+ public int[] getCodePoints() // todo: NOT Unicode!!
{
return unicodeCP;
}
@@ -267,7 +211,7 @@ public class TextPosition
{
if (x == Float.NEGATIVE_INFINITY)
{
- x = getXRot(rot);
+ return getXRot(rotation);
}
return x;
}
@@ -322,13 +266,13 @@ public class TextPosition
{
if (y == Float.NEGATIVE_INFINITY)
{
- if (rot == 0 || rot == 180)
+ if (rotation == 0 || rotation == 180)
{
- y = pageHeight - getYLowerLeftRot(rot);
+ return pageHeight - getYLowerLeftRot(rotation);
}
else
{
- y = pageWidth - getYLowerLeftRot(rot);
+ return pageWidth - getYLowerLeftRot(rotation);
}
}
return y;
@@ -379,7 +323,7 @@ public class TextPosition
*/
public float getWidth()
{
- return getWidthRot(rot);
+ return getWidthRot(rotation);
}
/**
@@ -608,6 +552,7 @@ public class TextPosition
currCharXStart += widths[i];
}
}
+
/**
* Inserts the diacritic TextPosition to the str of this TextPosition and updates the widths
* array to include the extra character width.
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java?rev=1603606&r1=1603605&r2=1603606&view=diff
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
(original)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java
Wed Jun 18 19:53:38 2014
@@ -269,17 +269,6 @@ public class PDFStreamEngine
}
/**
- * A method provided as an event interface to allow a subclass to perform some specific
- * functionality when text needs to be processed.
- *
- * @param text The text to be processed.
- */
- protected void processTextPosition(TextPosition text)
- {
- // subclasses can override to provide specific functionality.
- }
-
- /**
* Process encoded text from the PDF Stream. You should override this method if you want to
* perform an action when encoded text is being processed.
*
@@ -469,6 +458,17 @@ public class PDFStreamEngine
}
/**
+ * A method provided as an event interface to allow a subclass to perform some specific
+ * functionality when text needs to be processed.
+ *
+ * @param text The text to be processed.
+ */
+ protected void processTextPosition(TextPosition text)
+ {
+ // subclasses can override to provide specific functionality.
+ }
+
+ /**
* This is used to handle an operation.
*
* @param operation The operation to perform.
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1603606&r1=1603605&r2=1603606&view=diff
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
(original)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Wed Jun 18 19:53:38 2014
@@ -574,7 +574,7 @@ public class PDFTextStripper extends PDF
// Now cycle through to print the text.
// We queue up a line at a time before we print so that we can
convert
// the line from presentation form to logical form (if needed).
- List<TextPosition> line = new ArrayList<TextPosition>();
+ List<LineItem> line = new ArrayList<LineItem>();
textIter = textList.iterator(); // start from the beginning
again
// PDF files don't always store spaces. We will need to guess
where we should add
@@ -709,7 +709,7 @@ public class PDFTextStripper extends PDF
lastPosition.getTextPosition().getCharacter() != null
&&
!lastPosition.getTextPosition().getCharacter().endsWith(" "))
{
- line.add(WordSeparator.getSeparator());
+ line.add(LineItem.getWordSeparator());
}
}
if (positionY >= maxYForLine)
@@ -727,7 +727,7 @@ public class PDFTextStripper extends PDF
{
writeParagraphStart();//not sure this is correct for
RTL?
}
- line.add(position);
+ line.add(new LineItem(position));
}
maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
minYTopForLine = Math.min(minYTopForLine, positionY -
positionHeight);
@@ -1784,7 +1784,7 @@ public class PDFTextStripper extends PDF
* @param hasRtl determines if lines contains rtl formatted text(parts)
* @return a list of strings, one string for every word
*/
- private List<WordWithTextPositions> normalize(List<TextPosition> line,
boolean isRtlDominant,
+ private List<WordWithTextPositions> normalize(List<LineItem> line, boolean
isRtlDominant,
boolean hasRtl)
{
LinkedList<WordWithTextPositions> normalized = new
LinkedList<WordWithTextPositions>();
@@ -1801,9 +1801,9 @@ public class PDFTextStripper extends PDF
}
else
{
- for (TextPosition text : line)
+ for (LineItem item : line)
{
- lineBuilder = normalizeAdd(normalized, lineBuilder,
wordPositions, text);
+ lineBuilder = normalizeAdd(normalized, lineBuilder,
wordPositions, item);
}
}
if (lineBuilder.length() > 0)
@@ -1827,9 +1827,9 @@ public class PDFTextStripper extends PDF
* @return The StringBuilder that must be used when calling this method.
*/
private StringBuilder normalizeAdd(LinkedList<WordWithTextPositions>
normalized,
- StringBuilder lineBuilder, List<TextPosition> wordPositions,
TextPosition text)
+ StringBuilder lineBuilder, List<TextPosition> wordPositions,
LineItem item)
{
- if (text instanceof WordSeparator)
+ if (item.isWordSeparator())
{
normalized.add(createWord(lineBuilder.toString(),
new ArrayList<TextPosition>(wordPositions)));
@@ -1838,6 +1838,7 @@ public class PDFTextStripper extends PDF
}
else
{
+ TextPosition text = item.getTextPosition();
lineBuilder.append(text.getCharacter());
wordPositions.add(text);
}
@@ -1847,17 +1848,35 @@ public class PDFTextStripper extends PDF
/**
* internal marker class. Used as a place holder in a line of TextPositions.
*/
- private static final class WordSeparator extends TextPosition
+ private static final class LineItem
{
- private static final WordSeparator separator = new WordSeparator();
-
- private WordSeparator()
+ public static LineItem WORD_SEPARATOR = new LineItem();
+
+ public static LineItem getWordSeparator()
+ {
+ return WORD_SEPARATOR;
+ }
+
+ private final TextPosition textPosition;
+
+ private LineItem()
+ {
+ textPosition = null;
+ }
+
+ public LineItem(TextPosition textPosition)
+ {
+ this.textPosition = textPosition;
+ }
+
+ public TextPosition getTextPosition()
{
+ return textPosition;
}
- public static WordSeparator getSeparator()
+ public boolean isWordSeparator()
{
- return separator;
+ return textPosition == null;
}
}
Modified:
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java?rev=1603606&r1=1603605&r2=1603606&view=diff
==============================================================================
---
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
(original)
+++
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
Wed Jun 18 19:53:38 2014
@@ -273,9 +273,8 @@ public class TestTextStripper extends Te
if (!expectedFile.exists())
{
this.bFail = true;
- log.error(
- "FAILURE: Input verification file: " +
expectedFile.getAbsolutePath() +
- " did not exist");
+ fail("FAILURE: Input verification file: " +
expectedFile.getAbsolutePath() +
+ " did not exist");
return;
}
@@ -299,12 +298,12 @@ public class TestTextStripper extends Te
if (!stringsEqual(expectedLine, actualLine))
{
this.bFail = true;
- log.error("FAILURE: Line mismatch for file " +
inFile.getName() +
+ fail("FAILURE: Line mismatch for file " + inFile.getName()
+
" ( sort = "+bSort+")" +
" at expected line: " +
expectedReader.getLineNumber() +
- " at actual line: " +
actualReader.getLineNumber());
- log.error(" expected line was: \"" + expectedLine + "\"");
- log.error(" actual line was: \"" + actualLine + "\"" +
"\n");
+ " at actual line: " + actualReader.getLineNumber()
+
+ "\nexpected line was: \"" + expectedLine + "\"" +
+ "\nactual line was: \"" + actualLine + "\"" +
"\n");
//lets report all lines, even though this might produce some verbose logging
//break;