[ 
https://issues.apache.org/jira/browse/PDFBOX-3879?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

sun pengrui updated PDFBOX-3879:
--------------------------------
    Description: 
I'm trying to extract text from a PDF file and save it to an XML file. 
The PDF file includes italic and strikethrough text, but I cannot detect 
these styles with the PDFont class.
Below is the code and result.

{code:java}
public class TextExtractor extends PDFTextStripper {
    private static final Log LOG = LogFactory.getLog(PDFTextStripper.class);

    private final HashMap<TextPosition, String> colors = new HashMap<>();

    public TextExtractor() throws IOException {
        addOperator(new SetNonStrokingColorSpace());
        addOperator(new SetNonStrokingDeviceCMYKColor());
        addOperator(new SetNonStrokingDeviceRGBColor());
        addOperator(new SetNonStrokingDeviceGrayColor());
        addOperator(new SetNonStrokingColor());
        addOperator(new SetNonStrokingColorN());
    }

    @Override
    protected void startDocument(PDDocument document) throws IOException {
        super.startDocument(document);
        super.writeString("<?xml version=\"1.0\" 
encoding=\"UTF-8\"?>\n<document>\n");
    }

    @Override
    protected void endDocument(PDDocument document) throws IOException {
        super.endDocument(document);
        super.writeString("</document>\n");
    }

    @Override
    protected void startPage(PDPage page) throws IOException {
        super.startPage(page);
        super.writeString(String.format("  <page width=\"%f\" 
height=\"%f\">\n", page.getBBox().getWidth(), page.getBBox().getHeight()));
    }

    @Override
    protected void endPage(PDPage page) throws IOException {
        super.endPage(page);
        super.writeString("  </page>\n");

    }

    @Override
    protected void processTextPosition(TextPosition text) {
        super.processTextPosition(text);
        PDColor nonStrokingColor = getGraphicsState().getNonStrokingColor();
        try {
            String hex = Integer.toHexString(nonStrokingColor.toRGB() & 
0xffffff);
            while (hex.length() < 6) {
                hex = "0" + hex;
            }
            colors.put(text, "#" + hex);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    protected void writeString(String string, List<TextPosition> textPositions) 
throws IOException {
        StringBuilder builder = new StringBuilder("    <line>\n");
        String[] words = string.split(this.getWordSeparator());
        int startIndex = 0;
        for (String word : words) {
            if(Strings.isNullOrEmpty(word)){
                continue;
            }
            TextPosition startPosition = textPositions.get(startIndex);
            String color = colors.get(startPosition);
            String font = startPosition.getFont().getName();
            float fontSize = startPosition.getFontSize();
            float x = startPosition.getX();
            float y = startPosition.getY();
            TextPosition endPosition = textPositions.get(startIndex + 
word.length() - 1);
            float width = endPosition.getEndX() - startPosition.getX();
            float height = startPosition.getHeight();
            String template ="      <word x=\"%f\" y=\"%f\" width=\"%f\" 
height=\"%f\" font=\"%s\" font-size=\"%.0f\" color=\"%s\">%s</word>\n";
            builder.append(String.format(template, x, y, width, height, font, 
fontSize, color, escape(word)));
            startIndex += word.length() + 1;
        }
        builder.append("    </line>");
        super.writeString(builder.toString());
    }

    /**
     * Escape some HTML characters.
     *
     * @param chars String to be escaped
     * @return returns escaped String.
     */
    private static String escape(String chars)
    {
        StringBuilder builder = new StringBuilder(chars.length());
        for (int i = 0; i < chars.length(); i++)
        {
            appendEscaped(builder, chars.charAt(i));
        }
        return builder.toString();
    }

    private static void appendEscaped(StringBuilder builder, char character)
    {
        // write non-ASCII as named entities
        if ((character < 32) || (character > 126))
        {
            int charAsInt = character;
            builder.append("&#").append(charAsInt).append(";");
        }
        else
        {
            switch (character)
            {
                case 34:
                    builder.append("&quot;");
                    break;
                case 38:
                    builder.append("&amp;");
                    break;
                case 60:
                    builder.append("&lt;");
                    break;
                case 62:
                    builder.append("&gt;");
                    break;
                default:
                    builder.append(String.valueOf(character));
            }
        }
    }
}
{code}


{code:xml}
<document>
  <page width="595.000000" height="842.000000">
    <line>
      <word x="48.000000" y="89.000000" width="59.843376" height="20.234375" 
font="LucidaGrande" font-size="28" color="#000000">Title</word>
    </line>
    <line>
      <word x="48.000000" y="139.000000" width="32.190125" height="10.562654" 
font="LucidaGrande" font-size="14" color="#000000">Italic</word>
    </line>
    <line>
      <word x="48.000000" y="175.000000" width="31.480873" height="10.117188" 
font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
      <word x="84.171875" y="175.000000" width="26.590248" height="10.117188" 
font="LucidaGrande-Bold" font-size="14" color="#000000">and</word>
      <word x="115.453125" y="175.000000" width="39.458496" height="10.562654" 
font="LucidaGrande-Bold" font-size="14" color="#000000">Italic.</word>
    </line>
    <line>
      <word x="48.000000" y="211.000000" width="31.480873" height="10.117188" 
font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
    </line>
    <line>
      <word x="48.000000" y="247.000000" width="92.764618" height="10.117188" 
font="LucidaGrande" font-size="14" color="#000000">Strikethrough</word>
    </line>
    <line>
      <word x="48.000000" y="283.000000" width="36.523254" height="10.117188" 
font="LucidaGrande" font-size="14" color="#000000">some</word>
      <word x="89.000000" y="283.000000" width="26.803375" height="10.117188" 
font="LucidaGrande" font-size="14" color="#000000">text</word>
    </line>
    <line>
      <word x="48.000000" y="319.000000" width="27.180374" height="10.117188" 
font="LucidaGrande" font-size="14" color="#000000">new</word>
      <word x="79.687500" y="319.000000" width="24.523247" height="10.117188" 
font="LucidaGrande" font-size="14" color="#000000">line</word>
      <word x="108.687500" y="319.000000" width="25.350250" height="10.117188" 
font="LucidaGrande" font-size="14" color="#000000">test</word>
    </line>
  </page>
</document>
{code}



  was:
I'm trying to extract text from a PDF file and save it to an XML file. 
The PDF file includes italic and strikethrough text, but I cannot detect 
these styles with the PDFont class.
Below is the result.

{code:xml}
<document>
  <page width="595.000000" height="842.000000">
    <line>
      <word x="48.000000" y="89.000000" width="59.843376" height="20.234375" 
font="LucidaGrande" font-size="28" color="#000000">Title</word>
    </line>
    <line>
      <word x="48.000000" y="139.000000" width="32.190125" height="10.562654" 
font="LucidaGrande" font-size="14" color="#000000">Italic</word>
    </line>
    <line>
      <word x="48.000000" y="175.000000" width="31.480873" height="10.117188" 
font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
      <word x="84.171875" y="175.000000" width="26.590248" height="10.117188" 
font="LucidaGrande-Bold" font-size="14" color="#000000">and</word>
      <word x="115.453125" y="175.000000" width="39.458496" height="10.562654" 
font="LucidaGrande-Bold" font-size="14" color="#000000">Italic.</word>
    </line>
    <line>
      <word x="48.000000" y="211.000000" width="31.480873" height="10.117188" 
font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
    </line>
    <line>
      <word x="48.000000" y="247.000000" width="92.764618" height="10.117188" 
font="LucidaGrande" font-size="14" color="#000000">Strikethrough</word>
    </line>
    <line>
      <word x="48.000000" y="283.000000" width="36.523254" height="10.117188" 
font="LucidaGrande" font-size="14" color="#000000">some</word>
      <word x="89.000000" y="283.000000" width="26.803375" height="10.117188" 
font="LucidaGrande" font-size="14" color="#000000">text</word>
    </line>
    <line>
      <word x="48.000000" y="319.000000" width="27.180374" height="10.117188" 
font="LucidaGrande" font-size="14" color="#000000">new</word>
      <word x="79.687500" y="319.000000" width="24.523247" height="10.117188" 
font="LucidaGrande" font-size="14" color="#000000">line</word>
      <word x="108.687500" y="319.000000" width="25.350250" height="10.117188" 
font="LucidaGrande" font-size="14" color="#000000">test</word>
    </line>
  </page>
</document>
{code}




> Not able to get font styles, like italic and Strikethrough
> ----------------------------------------------------------
>
>                 Key: PDFBOX-3879
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-3879
>             Project: PDFBox
>          Issue Type: Bug
>          Components: Text extraction
>    Affects Versions: 2.0.7
>            Reporter: sun pengrui
>         Attachments: src.pdf
>
>
> I'm trying to extract text from a PDF file and save it to an XML file. 
> The PDF file includes italic and strikethrough text, but I cannot detect 
> these styles with the PDFont class.
> Below is the code and result.
> {code:java}
> public class TextExtractor extends PDFTextStripper { // writes extracted text as XML: <document>/<page>/<line>/<word>
>     private static final Log LOG = LogFactory.getLog(PDFTextStripper.class); // NOTE(review): bound to the parent class, and LOG is never used in this class
>     private final HashMap<TextPosition, String> colors = new HashMap<>(); // "#rrggbb" color recorded per text position in processTextPosition
>     public TextExtractor() throws IOException {
>         addOperator(new SetNonStrokingColorSpace());
>         addOperator(new SetNonStrokingDeviceCMYKColor());
>         addOperator(new SetNonStrokingDeviceRGBColor());
>         addOperator(new SetNonStrokingDeviceGrayColor());
>         addOperator(new SetNonStrokingColor());
>         addOperator(new SetNonStrokingColorN());
>     }
>     @Override
>     protected void startDocument(PDDocument document) throws IOException {
>         super.startDocument(document);
>         super.writeString("<?xml version=\"1.0\" 
> encoding=\"UTF-8\"?>\n<document>\n");
>     }
>     @Override
>     protected void endDocument(PDDocument document) throws IOException {
>         super.endDocument(document);
>         super.writeString("</document>\n");
>     }
>     @Override
>     protected void startPage(PDPage page) throws IOException {
>         super.startPage(page);
>         super.writeString(String.format("  <page width=\"%f\" 
> height=\"%f\">\n", page.getBBox().getWidth(), page.getBBox().getHeight()));
>     }
>     @Override
>     protected void endPage(PDPage page) throws IOException {
>         super.endPage(page);
>         super.writeString("  </page>\n");
>     }
>     @Override
>     protected void processTextPosition(TextPosition text) {
>         super.processTextPosition(text);
>         PDColor nonStrokingColor = getGraphicsState().getNonStrokingColor();
>         try {
>             String hex = Integer.toHexString(nonStrokingColor.toRGB() & 
> 0xffffff);
>             while (hex.length() < 6) { // left-pad with zeros to six hex digits
>                 hex = "0" + hex;
>             }
>             colors.put(text, "#" + hex);
>         } catch (IOException e) {
>             e.printStackTrace(); // NOTE(review): no color is stored for this position when toRGB() fails
>         }
>     }
>     @Override
>     protected void writeString(String string, List<TextPosition> 
> textPositions) throws IOException {
>         StringBuilder builder = new StringBuilder("    <line>\n");
>         String[] words = string.split(this.getWordSeparator()); // NOTE(review): split() treats the separator as a regex
>         int startIndex = 0; // index into textPositions of the current word's first character
>         for (String word : words) {
>             if(Strings.isNullOrEmpty(word)){
>                 continue; // NOTE(review): startIndex is not advanced for an empty token, so later words may use the wrong position
>             }
>             TextPosition startPosition = textPositions.get(startIndex);
>             String color = colors.get(startPosition);
>             String font = startPosition.getFont().getName();
>             float fontSize = startPosition.getFontSize();
>             float x = startPosition.getX();
>             float y = startPosition.getY();
>             TextPosition endPosition = textPositions.get(startIndex + 
> word.length() - 1);
>             float width = endPosition.getEndX() - startPosition.getX();
>             float height = startPosition.getHeight();
>             String template ="      <word x=\"%f\" y=\"%f\" width=\"%f\" 
> height=\"%f\" font=\"%s\" font-size=\"%.0f\" color=\"%s\">%s</word>\n";
>             builder.append(String.format(template, x, y, width, height, font, 
> fontSize, color, escape(word)));
>             startIndex += word.length() + 1; // skips the word plus one separator character
>         }
>         builder.append("    </line>");
>         super.writeString(builder.toString());
>     }
>     /**
>      * Escapes a string for the XML output by passing every char
>      * through appendEscaped.
>      *
>      * @param chars String to be escaped
>      * @return returns escaped String.
>      */
>     private static String escape(String chars)
>     {
>         StringBuilder builder = new StringBuilder(chars.length());
>         for (int i = 0; i < chars.length(); i++)
>         {
>             appendEscaped(builder, chars.charAt(i));
>         }
>         return builder.toString();
>     }
>     private static void appendEscaped(StringBuilder builder, char character)
>     {
>         // write chars below 32 or above 126 as numeric character references
>         if ((character < 32) || (character > 126))
>         {
>             int charAsInt = character;
>             builder.append("&#").append(charAsInt).append(";");
>         }
>         else
>         {
>             switch (character)
>             {
>                 case 34: // double quote
>                     builder.append("&quot;");
>                     break;
>                 case 38: // ampersand
>                     builder.append("&amp;");
>                     break;
>                 case 60: // less-than sign
>                     builder.append("&lt;");
>                     break;
>                 case 62: // greater-than sign
>                     builder.append("&gt;");
>                     break;
>                 default:
>                     builder.append(String.valueOf(character));
>             }
>         }
>     }
> }
> {code}
> {code:xml}
> <document>
>   <page width="595.000000" height="842.000000">
>     <line>
>       <word x="48.000000" y="89.000000" width="59.843376" height="20.234375" 
> font="LucidaGrande" font-size="28" color="#000000">Title</word>
>     </line>
>     <line>
>       <word x="48.000000" y="139.000000" width="32.190125" height="10.562654" 
> font="LucidaGrande" font-size="14" color="#000000">Italic</word>
>     </line>
>     <line>
>       <word x="48.000000" y="175.000000" width="31.480873" height="10.117188" 
> font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
>       <word x="84.171875" y="175.000000" width="26.590248" height="10.117188" 
> font="LucidaGrande-Bold" font-size="14" color="#000000">and</word>
>       <word x="115.453125" y="175.000000" width="39.458496" 
> height="10.562654" font="LucidaGrande-Bold" font-size="14" 
> color="#000000">Italic.</word>
>     </line>
>     <line>
>       <word x="48.000000" y="211.000000" width="31.480873" height="10.117188" 
> font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
>     </line>
>     <line>
>       <word x="48.000000" y="247.000000" width="92.764618" height="10.117188" 
> font="LucidaGrande" font-size="14" color="#000000">Strikethrough</word>
>     </line>
>     <line>
>       <word x="48.000000" y="283.000000" width="36.523254" height="10.117188" 
> font="LucidaGrande" font-size="14" color="#000000">some</word>
>       <word x="89.000000" y="283.000000" width="26.803375" height="10.117188" 
> font="LucidaGrande" font-size="14" color="#000000">text</word>
>     </line>
>     <line>
>       <word x="48.000000" y="319.000000" width="27.180374" height="10.117188" 
> font="LucidaGrande" font-size="14" color="#000000">new</word>
>       <word x="79.687500" y="319.000000" width="24.523247" height="10.117188" 
> font="LucidaGrande" font-size="14" color="#000000">line</word>
>       <word x="108.687500" y="319.000000" width="25.350250" 
> height="10.117188" font="LucidaGrande" font-size="14" 
> color="#000000">test</word>
>     </line>
>   </page>
> </document>
> {code}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to