Here's some code for you to try. It should work with any fine rotation; I have tested it on 2 files, LOL. Please tell me if it works for you.

/**
 *
 * @author Tilman Hausherr
 */
public class ExtractAngledText
{
    Set<Integer> angles = new HashSet<>();

    public Set<Integer> getAngles()
    {
        return angles;
    }

    /**
     * This will print the documents data.
     *
     * @param args The command line arguments.
     *
     * @throws IOException If there is an error parsing the document.
     */
    public static void main(String[] args) throws IOException
    {
        args = new String[]{"PDFBOX-4368-many-rotations.pdf"};
        if (args.length != 1)
        {
            usage();
        }
        else
        {
            AngleCollector angleCollector = new AngleCollector();
            try (PDDocument doc = PDDocument.load(new File(args[0])))
            {
                for (int p = 1; p <= doc.getNumberOfPages(); ++p)
                {
                    System.out.printf("Page: %3d\n", p);
                    System.out.println("----------");

                    angleCollector.setStartPage(p);
                    angleCollector.setEndPage(p);
                    angleCollector.getText(doc);
                    System.out.println("Collected angles: " + angleCollector.getAngles());
                    System.out.println();

                    PDPage page = doc.getPage(0);
                    FilteredTextStripper filteredTextStripper = new FilteredTextStripper();
                    for (int angle : angleCollector.getAngles())
                    {
                        filteredTextStripper.setStartPage(p);
                        filteredTextStripper.setEndPage(p);

                        System.out.printf("Angle: %3d\n", angle);
                        System.out.println("----------");
                        String text;
                        if (angle == 0)
                        {
                            text = filteredTextStripper.getText(doc);
                        }
                        else
                        {
                            // prepend a transformation
                            try (PDPageContentStream cs = new PDPageContentStream(doc, page, AppendMode.PREPEND, false))
                            {
cs.transform(Matrix.getRotateInstance(-Math.toRadians(angle), 0, 0));
                            }

                            text = filteredTextStripper.getText(doc);

                            // remove transformation
                            COSArray contents = (COSArray) page.getCOSObject().getItem(COSName.CONTENTS);
                            contents.remove(0);
                        }
                        System.out.println(text);
                    }
                }
            }
        }
    }

    /**
     * This will print the usage for this document.
     */
    private static void usage()
    {
        System.err.println("Usage: java " + AngleCollector.class.getName() + " <input-pdf>");
    }
}

class AngleCollector extends PDFTextStripper
{
    Set<Integer> angles = new HashSet<>();

    public Set<Integer> getAngles()
    {
        return angles;
    }

    /**
     * Instantiate a new PDFTextStripper object.
     *
     * @throws IOException If there is an error loading the properties.
     */
    public AngleCollector() throws IOException
    {
    }

    @Override
    protected void processTextPosition(TextPosition text)
    {
        Matrix m = text.getTextMatrix();
        int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
        angles.add(angle);
    }
}

class FilteredTextStripper extends PDFTextStripper
{
    public FilteredTextStripper() throws IOException
    {
    }

    @Override
    protected void processTextPosition(TextPosition text)
    {
        Matrix m = text.getTextMatrix();
        int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
        if (angle == 0)
        {
            super.processTextPosition(text);
        }
    }
}

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to