[
https://issues.apache.org/jira/browse/PDFBOX-3879?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
sun pengrui updated PDFBOX-3879:
--------------------------------
Description:
I'm trying to extract text from a PDF file and save it to an XML file.
The PDF file contains italic and strikethrough text, but I cannot detect these
styles through the PDFont class.
Below are the code and the result.
{code:java}
public class TextExtractor extends PDFTextStripper {
private static final Log LOG = LogFactory.getLog(PDFTextStripper.class);
private final HashMap<TextPosition, String> colors = new HashMap<>();
public TextExtractor() throws IOException {
addOperator(new SetNonStrokingColorSpace());
addOperator(new SetNonStrokingDeviceCMYKColor());
addOperator(new SetNonStrokingDeviceRGBColor());
addOperator(new SetNonStrokingDeviceGrayColor());
addOperator(new SetNonStrokingColor());
addOperator(new SetNonStrokingColorN());
}
@Override
protected void startDocument(PDDocument document) throws IOException {
super.startDocument(document);
super.writeString("<?xml version=\"1.0\"
encoding=\"UTF-8\"?>\n<document>\n");
}
@Override
protected void endDocument(PDDocument document) throws IOException {
super.endDocument(document);
super.writeString("</document>\n");
}
@Override
protected void startPage(PDPage page) throws IOException {
super.startPage(page);
super.writeString(String.format(" <page width=\"%f\"
height=\"%f\">\n", page.getBBox().getWidth(), page.getBBox().getHeight()));
}
@Override
protected void endPage(PDPage page) throws IOException {
super.endPage(page);
super.writeString(" </page>\n");
}
@Override
protected void processTextPosition(TextPosition text) {
super.processTextPosition(text);
PDColor nonStrokingColor = getGraphicsState().getNonStrokingColor();
try {
String hex = Integer.toHexString(nonStrokingColor.toRGB() &
0xffffff);
while (hex.length() < 6) {
hex = "0" + hex;
}
colors.put(text, "#" + hex);
} catch (IOException e) {
e.printStackTrace();
}
}
@Override
protected void writeString(String string, List<TextPosition> textPositions)
throws IOException {
StringBuilder builder = new StringBuilder(" <line>\n");
String[] words = string.split(this.getWordSeparator());
int startIndex = 0;
for (String word : words) {
if(Strings.isNullOrEmpty(word)){
continue;
}
TextPosition startPosition = textPositions.get(startIndex);
String color = colors.get(startPosition);
String font = startPosition.getFont().getName();
float fontSize = startPosition.getFontSize();
float x = startPosition.getX();
float y = startPosition.getY();
TextPosition endPosition = textPositions.get(startIndex +
word.length() - 1);
float width = endPosition.getEndX() - startPosition.getX();
float height = startPosition.getHeight();
String template =" <word x=\"%f\" y=\"%f\" width=\"%f\"
height=\"%f\" font=\"%s\" font-size=\"%.0f\" color=\"%s\">%s</word>\n";
builder.append(String.format(template, x, y, width, height, font,
fontSize, color, escape(word)));
startIndex += word.length() + 1;
}
builder.append(" </line>");
super.writeString(builder.toString());
}
/**
* Escape some HTML characters.
*
* @param chars String to be escaped
* @return returns escaped String.
*/
private static String escape(String chars)
{
StringBuilder builder = new StringBuilder(chars.length());
for (int i = 0; i < chars.length(); i++)
{
appendEscaped(builder, chars.charAt(i));
}
return builder.toString();
}
private static void appendEscaped(StringBuilder builder, char character)
{
// write non-ASCII as named entities
if ((character < 32) || (character > 126))
{
int charAsInt = character;
builder.append("&#").append(charAsInt).append(";");
}
else
{
switch (character)
{
case 34:
builder.append(""");
break;
case 38:
builder.append("&");
break;
case 60:
builder.append("<");
break;
case 62:
builder.append(">");
break;
default:
builder.append(String.valueOf(character));
}
}
}
}
{code}
{code:xml}
<document>
<page width="595.000000" height="842.000000">
<line>
<word x="48.000000" y="89.000000" width="59.843376" height="20.234375"
font="LucidaGrande" font-size="28" color="#000000">Title</word>
</line>
<line>
<word x="48.000000" y="139.000000" width="32.190125" height="10.562654"
font="LucidaGrande" font-size="14" color="#000000">Italic</word>
</line>
<line>
<word x="48.000000" y="175.000000" width="31.480873" height="10.117188"
font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
<word x="84.171875" y="175.000000" width="26.590248" height="10.117188"
font="LucidaGrande-Bold" font-size="14" color="#000000">and</word>
<word x="115.453125" y="175.000000" width="39.458496" height="10.562654"
font="LucidaGrande-Bold" font-size="14" color="#000000">Italic.</word>
</line>
<line>
<word x="48.000000" y="211.000000" width="31.480873" height="10.117188"
font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
</line>
<line>
<word x="48.000000" y="247.000000" width="92.764618" height="10.117188"
font="LucidaGrande" font-size="14" color="#000000">Strikethrough</word>
</line>
<line>
<word x="48.000000" y="283.000000" width="36.523254" height="10.117188"
font="LucidaGrande" font-size="14" color="#000000">some</word>
<word x="89.000000" y="283.000000" width="26.803375" height="10.117188"
font="LucidaGrande" font-size="14" color="#000000">text</word>
</line>
<line>
<word x="48.000000" y="319.000000" width="27.180374" height="10.117188"
font="LucidaGrande" font-size="14" color="#000000">new</word>
<word x="79.687500" y="319.000000" width="24.523247" height="10.117188"
font="LucidaGrande" font-size="14" color="#000000">line</word>
<word x="108.687500" y="319.000000" width="25.350250" height="10.117188"
font="LucidaGrande" font-size="14" color="#000000">test</word>
</line>
</page>
</document>
{code}
was:
I'm trying to extract text from a PDF file and save it to an XML file.
The PDF file contains italic and strikethrough text, but I cannot detect these
styles through the PDFont class.
Below is the result.
{code:xml}
<document>
<page width="595.000000" height="842.000000">
<line>
<word x="48.000000" y="89.000000" width="59.843376" height="20.234375"
font="LucidaGrande" font-size="28" color="#000000">Title</word>
</line>
<line>
<word x="48.000000" y="139.000000" width="32.190125" height="10.562654"
font="LucidaGrande" font-size="14" color="#000000">Italic</word>
</line>
<line>
<word x="48.000000" y="175.000000" width="31.480873" height="10.117188"
font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
<word x="84.171875" y="175.000000" width="26.590248" height="10.117188"
font="LucidaGrande-Bold" font-size="14" color="#000000">and</word>
<word x="115.453125" y="175.000000" width="39.458496" height="10.562654"
font="LucidaGrande-Bold" font-size="14" color="#000000">Italic.</word>
</line>
<line>
<word x="48.000000" y="211.000000" width="31.480873" height="10.117188"
font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
</line>
<line>
<word x="48.000000" y="247.000000" width="92.764618" height="10.117188"
font="LucidaGrande" font-size="14" color="#000000">Strikethrough</word>
</line>
<line>
<word x="48.000000" y="283.000000" width="36.523254" height="10.117188"
font="LucidaGrande" font-size="14" color="#000000">some</word>
<word x="89.000000" y="283.000000" width="26.803375" height="10.117188"
font="LucidaGrande" font-size="14" color="#000000">text</word>
</line>
<line>
<word x="48.000000" y="319.000000" width="27.180374" height="10.117188"
font="LucidaGrande" font-size="14" color="#000000">new</word>
<word x="79.687500" y="319.000000" width="24.523247" height="10.117188"
font="LucidaGrande" font-size="14" color="#000000">line</word>
<word x="108.687500" y="319.000000" width="25.350250" height="10.117188"
font="LucidaGrande" font-size="14" color="#000000">test</word>
</line>
</page>
</document>
{code}
> Not able to get font styles, like italic and Strikethrough
> ----------------------------------------------------------
>
> Key: PDFBOX-3879
> URL: https://issues.apache.org/jira/browse/PDFBOX-3879
> Project: PDFBox
> Issue Type: Bug
> Components: Text extraction
> Affects Versions: 2.0.7
> Reporter: sun pengrui
> Attachments: src.pdf
>
>
> I'm trying to extract text from a PDF file and save it to an XML file.
> The PDF file contains italic and strikethrough text, but I cannot detect these
> styles through the PDFont class.
> Below are the code and the result.
> {code:java}
> public class TextExtractor extends PDFTextStripper {
> private static final Log LOG = LogFactory.getLog(PDFTextStripper.class);
> private final HashMap<TextPosition, String> colors = new HashMap<>();
> public TextExtractor() throws IOException {
> addOperator(new SetNonStrokingColorSpace());
> addOperator(new SetNonStrokingDeviceCMYKColor());
> addOperator(new SetNonStrokingDeviceRGBColor());
> addOperator(new SetNonStrokingDeviceGrayColor());
> addOperator(new SetNonStrokingColor());
> addOperator(new SetNonStrokingColorN());
> }
> @Override
> protected void startDocument(PDDocument document) throws IOException {
> super.startDocument(document);
> super.writeString("<?xml version=\"1.0\"
> encoding=\"UTF-8\"?>\n<document>\n");
> }
> @Override
> protected void endDocument(PDDocument document) throws IOException {
> super.endDocument(document);
> super.writeString("</document>\n");
> }
> @Override
> protected void startPage(PDPage page) throws IOException {
> super.startPage(page);
> super.writeString(String.format(" <page width=\"%f\"
> height=\"%f\">\n", page.getBBox().getWidth(), page.getBBox().getHeight()));
> }
> @Override
> protected void endPage(PDPage page) throws IOException {
> super.endPage(page);
> super.writeString(" </page>\n");
> }
> @Override
> protected void processTextPosition(TextPosition text) {
> super.processTextPosition(text);
> PDColor nonStrokingColor = getGraphicsState().getNonStrokingColor();
> try {
> String hex = Integer.toHexString(nonStrokingColor.toRGB() &
> 0xffffff);
> while (hex.length() < 6) {
> hex = "0" + hex;
> }
> colors.put(text, "#" + hex);
> } catch (IOException e) {
> e.printStackTrace();
> }
> }
> @Override
> protected void writeString(String string, List<TextPosition>
> textPositions) throws IOException {
> StringBuilder builder = new StringBuilder(" <line>\n");
> String[] words = string.split(this.getWordSeparator());
> int startIndex = 0;
> for (String word : words) {
> if(Strings.isNullOrEmpty(word)){
> continue;
> }
> TextPosition startPosition = textPositions.get(startIndex);
> String color = colors.get(startPosition);
> String font = startPosition.getFont().getName();
> float fontSize = startPosition.getFontSize();
> float x = startPosition.getX();
> float y = startPosition.getY();
> TextPosition endPosition = textPositions.get(startIndex +
> word.length() - 1);
> float width = endPosition.getEndX() - startPosition.getX();
> float height = startPosition.getHeight();
> String template =" <word x=\"%f\" y=\"%f\" width=\"%f\"
> height=\"%f\" font=\"%s\" font-size=\"%.0f\" color=\"%s\">%s</word>\n";
> builder.append(String.format(template, x, y, width, height, font,
> fontSize, color, escape(word)));
> startIndex += word.length() + 1;
> }
> builder.append(" </line>");
> super.writeString(builder.toString());
> }
> /**
> * Escape some HTML characters.
> *
> * @param chars String to be escaped
> * @return returns escaped String.
> */
> private static String escape(String chars)
> {
> StringBuilder builder = new StringBuilder(chars.length());
> for (int i = 0; i < chars.length(); i++)
> {
> appendEscaped(builder, chars.charAt(i));
> }
> return builder.toString();
> }
> private static void appendEscaped(StringBuilder builder, char character)
> {
> // write non-ASCII as named entities
> if ((character < 32) || (character > 126))
> {
> int charAsInt = character;
> builder.append("&#").append(charAsInt).append(";");
> }
> else
> {
> switch (character)
> {
> case 34:
> builder.append("&quot;");
> break;
> case 38:
> builder.append("&amp;");
> break;
> case 60:
> builder.append("&lt;");
> break;
> case 62:
> builder.append("&gt;");
> break;
> default:
> builder.append(String.valueOf(character));
> }
> }
> }
> }
> {code}
> {code:xml}
> <document>
> <page width="595.000000" height="842.000000">
> <line>
> <word x="48.000000" y="89.000000" width="59.843376" height="20.234375"
> font="LucidaGrande" font-size="28" color="#000000">Title</word>
> </line>
> <line>
> <word x="48.000000" y="139.000000" width="32.190125" height="10.562654"
> font="LucidaGrande" font-size="14" color="#000000">Italic</word>
> </line>
> <line>
> <word x="48.000000" y="175.000000" width="31.480873" height="10.117188"
> font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
> <word x="84.171875" y="175.000000" width="26.590248" height="10.117188"
> font="LucidaGrande-Bold" font-size="14" color="#000000">and</word>
> <word x="115.453125" y="175.000000" width="39.458496"
> height="10.562654" font="LucidaGrande-Bold" font-size="14"
> color="#000000">Italic.</word>
> </line>
> <line>
> <word x="48.000000" y="211.000000" width="31.480873" height="10.117188"
> font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
> </line>
> <line>
> <word x="48.000000" y="247.000000" width="92.764618" height="10.117188"
> font="LucidaGrande" font-size="14" color="#000000">Strikethrough</word>
> </line>
> <line>
> <word x="48.000000" y="283.000000" width="36.523254" height="10.117188"
> font="LucidaGrande" font-size="14" color="#000000">some</word>
> <word x="89.000000" y="283.000000" width="26.803375" height="10.117188"
> font="LucidaGrande" font-size="14" color="#000000">text</word>
> </line>
> <line>
> <word x="48.000000" y="319.000000" width="27.180374" height="10.117188"
> font="LucidaGrande" font-size="14" color="#000000">new</word>
> <word x="79.687500" y="319.000000" width="24.523247" height="10.117188"
> font="LucidaGrande" font-size="14" color="#000000">line</word>
> <word x="108.687500" y="319.000000" width="25.350250"
> height="10.117188" font="LucidaGrande" font-size="14"
> color="#000000">test</word>
> </line>
> </page>
> </document>
> {code}
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]