Hi all,
I found out when trying to use PDFBox that it doesn't support filling in
multiline text boxes correctly [1] so I've created a patch which
improves multiline (and auto-sizing) support.
The text positioning algorithm is still black magic; it was tweaked
until text in multiline text boxes started close to where Acrobat puts it.
The maximum font size of 12 for auto-sized text in multiline text boxes
was chosen because that is what Acrobat appeared to do. I'm not sure whether
this is an artifact of my limited test data or whether it's defined somewhere...
I've been using a PDF which contains numerous text boxes of different
styles to test it, and what PDFBox produces is very similar to what Adobe
Acrobat Professional produces. However, this PDF is copyrighted by a third
party. Does anyone know of a decent source of PDFs with text fields in
them, or would it be acceptable to create tests which just use mock
objects? I assume tests are required before creating a JIRA issue and
attaching the patch...
Cheers,
Will.
[1] The main problems found were:
* Not positioning the start of the text correctly
* Not displaying the text at all (auto-sized text boxes)
* Not adding line breaks to long lines
* Very large fonts had the next line of text start too close to the line
above.
Index: parent/pom.xml
===================================================================
--- parent/pom.xml (revision 1366377)
+++ parent/pom.xml (working copy)
@@ -48,6 +48,12 @@
<properties>
<project.build.sourceEncoding>ISO-8859-1</project.build.sourceEncoding>
</properties>
+
+ <scm>
+ <connection>scm:svn:https://svn.apache.org/repos/asf/pdfbox/trunk/parent</connection>
+ <developerConnection>scm:svn:https://svn.apache.org/repos/asf/pdfbox/trunk/parent</developerConnection>
+ <url>http://svn.apache.org/viewvc/pdfbox/trunk/parent/</url>
+ </scm>
<profiles>
<profile>
Index: pdfbox/src/main/java/org/apache/pdfbox/pdmodel/interactive/form/PDAppearance.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/pdmodel/interactive/form/PDAppearance.java (revision 1366377)
+++ pdfbox/src/main/java/org/apache/pdfbox/pdmodel/interactive/form/PDAppearance.java (working copy)
@@ -26,6 +26,7 @@
import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import java.util.regex.Pattern;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSDictionary;
@@ -69,6 +70,10 @@
*/
public class PDAppearance
{
+
+ private static final Pattern PDF_NEW_LINE_REGEX = Pattern.compile("\\) Tj\\n0 -[\\d\\.]+ Td\\n\\(");
+ private static final Pattern NEW_LINE = Pattern.compile("\n");
+
private PDVariableText parent;
private String value;
@@ -209,11 +214,6 @@
*/
public void setAppearanceValue(String apValue) throws IOException
{
- // MulitLine check and set
- if ( parent.isMultiline() && apValue.indexOf('\n') != -1 )
- {
- apValue = convertToMultiLine( apValue );
- }
value = apValue;
Iterator<COSObjectable> widgetIter = widgets.iterator();
@@ -387,7 +387,21 @@
q == PDTextbox.QUADDING_RIGHT )
{
float fieldWidth = boundingBox.getWidth();
- float stringWidth = (pdFont.getStringWidth( value )/1000)*fontSize;
+
+ float stringWidth = Float.MIN_VALUE;
+
+ if (parent.isMultiline())
+ {
+ for (String line : PDF_NEW_LINE_REGEX.split(value))
+ {
+ stringWidth = Math.max(stringWidth, (pdFont.getStringWidth( line )/1000)*fontSize);
+ }
+ }
+ else
+ {
+ stringWidth = (pdFont.getStringWidth( value )/1000)*fontSize;
+ }
+
float adjustAmount = fieldWidth - stringWidth - 4;
if( q == PDTextbox.QUADDING_CENTERED )
@@ -443,19 +457,25 @@
return retval;
}
- private String convertToMultiLine( String line )
+ /**
+ * Replace all new lines with the appropriate Td commands.
+ * @param pdFont Font being used to display the text.
+ * @param fontSize Font size being used to display the text.
+ */
+ private void convertToMultiLine( PDFont pdFont, float fontSize )
{
- int currIdx = 0;
- int lastIdx = 0;
- StringBuffer result = new StringBuffer(line.length() + 64);
- while( (currIdx = line.indexOf('\n',lastIdx )) > -1 )
+ float ascent = 0;
+ float descent = 0;
+ if ( pdFont instanceof PDSimpleFont )
{
- result.append(line.substring(lastIdx,currIdx));
- result.append(" ) Tj\n0 -13 Td\n(");
- lastIdx = currIdx + 1;
+ final PDFontDescriptor fontDescriptor = ((PDSimpleFont) pdFont).getFontDescriptor();
+
+ ascent = fontDescriptor.getAscent();
+ descent = -fontDescriptor.getDescent();
}
- result.append(line.substring(lastIdx));
- return result.toString();
+
+ float yDirection = fontSize + (ascent / 1000) + (descent / 1000);
+ value = NEW_LINE.matcher(value).replaceAll(") Tj\n0 -" + yDirection + " Td\n(");
}
/**
@@ -539,40 +559,135 @@
int fontIndex = daTokens.indexOf( PDFOperator.getOperator( "Tf" ) );
if(fontIndex != -1 )
{
- fontSize = ((COSNumber)daTokens.get(fontIndex-1)).floatValue();
+ fontSize = ((COSNumber) daTokens.get(fontIndex-1)).floatValue();
}
}
-
- float widthBasedFontSize = Float.MAX_VALUE;
-
- if( parent.doNotScroll() )
+
+ if(fontSize == 0)
{
- //if we don't scroll then we will shrink the font to fit into the text area.
- float widthAtFontSize1 = pdFont.getStringWidth( value )/1000.f;
- float availableWidth = getAvailableWidth(boundingBox, getLineWidth(tokens));
- widthBasedFontSize = availableWidth / widthAtFontSize1;
+ // 0 means use auto-sizing
+ // "A zero value for size means that the font shall be auto-sized: its size shall be computed as a
+ // function of the height of the annotation rectangle." - from http://www.adobe.com/devnet/pdf/pdf_reference.html
+ if( parent.isMultiline() )
+ {
+ fontSize = calculateAutoSizedMultiBoxFont(pdFont, boundingBox, tokens);
+ }
+ else
+ {
+ fontSize = calculateAutoSizedTextBoxFont(pdFont, boundingBox, tokens);
+ }
+
}
- else if( fontSize == 0 )
+
+ if( parent.isMultiline() )
{
- float lineWidth = getLineWidth( tokens );
- float stringWidth = pdFont.getStringWidth( value );
- float height = 0;
- if( pdFont instanceof PDSimpleFont )
+ convertToMultiLine(pdFont, fontSize);
+ }
+
+ return fontSize;
+ }
+
+ /**
+ * Calculate the size a font should be for a normal text box.
+ * @return Font size that should be used to fit the text in the text box.
+ * @throws IOException Thrown if an error occurs during parsing.
+ */
+ private float calculateAutoSizedTextBoxFont( PDFont pdFont, PDRectangle boundingBox, List tokens ) throws IOException {
+ float lineWidth = getLineWidth(tokens);
+ float height = 0;
+ if( pdFont instanceof PDSimpleFont )
+ {
+ height = ((PDSimpleFont) pdFont).getFontDescriptor().getFontBoundingBox().getHeight();
+ }
+ else
+ {
+ //not much we can do, so lets assume font is square and use width
+ //as the height
+ height = pdFont.getAverageFontWidth();
+ }
+ height = height/1000f;
+
+ float availHeight = getAvailableHeight( boundingBox, lineWidth );
+ return availHeight/height;
+ }
+
+ /**
+ * Calculate the size a font should be for a multiline text box.
+ * This method will try and keep all the contents inside the bounding box, ignoring whether the field should
+ * scroll or not.
+ * @return Font size that should be used to fit the text in the text box.
+ * @throws IOException Thrown if an error occurs during parsing.
+ */
+ private float calculateAutoSizedMultiBoxFont( PDFont pdFont, PDRectangle boundingBox, List tokens ) throws IOException {
+
+ // Make sure that the lines are suitably short
+ insertWordWrapPoints(boundingBox, pdFont);
+
+ String[] lines = value.split("\n");
+ float widthAtFontSize1 = Float.MIN_VALUE;
+
+ final float newLineWidth = pdFont.getStringWidth("\n") / 1000.f;
+ for (String line : lines)
+ {
+ final float lineWidth = pdFont.getStringWidth(line) / 1000.f;
+ widthAtFontSize1 = Math.max(lineWidth + newLineWidth, widthAtFontSize1);
+ }
+
+ float height = 0;
+ if( pdFont instanceof PDSimpleFont )
+ {
+ height = ((PDSimpleFont) pdFont).getFontDescriptor().getFontBoundingBox().getHeight();
+ }
+ else
+ {
+ //not much we can do, so lets assume font is square and use width
+ //as the height
+ height = pdFont.getAverageFontWidth();
+ }
+ height = (height/1000f) * lines.length;
+
+ float availableWidth = getAvailableWidth(boundingBox, getLineWidth(tokens));
+ float widthBasedFontSize = availableWidth / widthAtFontSize1;
+
+ float lineWidth = getLineWidth( tokens );
+ float availHeight = getAvailableHeight( boundingBox, lineWidth );
+ float heightBasedFontSize = availHeight / height;
+
+ return min(12, heightBasedFontSize, widthBasedFontSize);
+ }
+
+ /**
+ * Put word wrapping points (new line characters) into the value so that long lines don't shrink the text down.
+ * Note: This should only be used for multiline boxes
+ */
+ private void insertWordWrapPoints(PDRectangle boundingBox, PDFont font)
+ {
+ final float maximumWidth = boundingBox.getWidth() * 160;
+
+ StringBuilder wrapped = new StringBuilder(value.length());
+ float distanceSinceNewLine = 0;
+
+ for (char c : value.toCharArray())
+ {
+ if (c == '\n')
{
- height = ((PDSimpleFont)pdFont).getFontDescriptor().getFontBoundingBox().getHeight();
+ distanceSinceNewLine = 0;
}
else
{
- //now much we can do, so lets assume font is square and use width
- //as the height
- height = pdFont.getAverageFontWidth();
+ distanceSinceNewLine += font.getFontWidth(c);
+ if ( distanceSinceNewLine > maximumWidth && Character.isWhitespace(c) )
+ {
+ wrapped.append('\n');
+ distanceSinceNewLine = 0;
+ continue;
+ }
}
- height = height/1000f;
- float availHeight = getAvailableHeight( boundingBox, lineWidth );
- fontSize = Math.min((availHeight/height), widthBasedFontSize);
+ wrapped.append(c);
}
- return fontSize;
+
+ value = wrapped.toString();
}
/**
@@ -587,34 +702,35 @@
private String getTextPosition( PDRectangle boundingBox, PDFont pdFont, float fontSize, List tokens )
throws IOException
{
- float lineWidth = getLineWidth( tokens );
+ if( !( pdFont instanceof PDSimpleFont ) )
+ {
+ throw new IOException( "Error: Don't know how to calculate the position for non-simple fonts" );
+ }
+
float pos = 0.0f;
- if(parent.isMultiline())
+ PDFontDescriptor fd = ((PDSimpleFont) pdFont).getFontDescriptor();
+ if( parent.isMultiline() )
{
- int rows = (int) (getAvailableHeight( boundingBox, lineWidth ) / ((int) fontSize));
- pos = ((rows)*fontSize)-fontSize;
+ // Note - this algorithm was copied from the non-multiline part and tweaked till it looked okay...
+ float bBoxHeight = boundingBox.getHeight();
+ float fontHeight = fd.getFontBoundingBox().getHeight() + 2 * fd.getDescent();
+ fontHeight = (fontHeight/1000) * fontSize;
+ pos = bBoxHeight - (fontHeight * 2.5f);
+
}
else
{
- if( pdFont instanceof PDSimpleFont )
- {
- //BJL 9/25/2004
- //This algorithm is a little bit of black magic. It does
- //not appear to be documented anywhere. Through examining a few
- //PDF documents and the value that Acrobat places in there I
- //have determined that the below method of computing the position
- //is correct for certain documents, but maybe not all. It does
- //work f1040ez.pdf and Form_1.pdf
- PDFontDescriptor fd = ((PDSimpleFont)pdFont).getFontDescriptor();
- float bBoxHeight = boundingBox.getHeight();
- float fontHeight = fd.getFontBoundingBox().getHeight() + 2 * fd.getDescent();
- fontHeight = (fontHeight/1000) * fontSize;
- pos = (bBoxHeight - fontHeight)/2;
- }
- else
- {
- throw new IOException( "Error: Don't know how to calculate the position for non-simple fonts" );
- }
+ //BJL 9/25/2004
+ //This algorithm is a little bit of black magic. It does
+ //not appear to be documented anywhere. Through examining a few
+ //PDF documents and the value that Acrobat places in there I
+ //have determined that the below method of computing the position
+ //is correct for certain documents, but maybe not all. It does
+ //work f1040ez.pdf and Form_1.pdf
+ float bBoxHeight = boundingBox.getHeight();
+ float fontHeight = fd.getFontBoundingBox().getHeight() + 2 * fd.getDescent();
+ fontHeight = (fontHeight/1000) * fontSize;
+ pos = (bBoxHeight - fontHeight)/2;
}
PDRectangle innerBox = getSmallestDrawnRectangle( boundingBox, tokens );
float xInset = 2+ 2*(boundingBox.getWidth() - innerBox.getWidth());
@@ -638,4 +754,12 @@
{
return boundingBox.getHeight() - 2 * lineWidth;
}
+
+ private float min(float... floats) {
+ float min = Float.MAX_VALUE;
+ for (float f : floats) {
+ min = Math.min(min, f);
+ }
+ return min;
+ }
}