Here's some code for you to try. It should work with any fine rotation;
I have tested it on 2 files LOL. Please let me know if it works for you.
/**
 * Command-line example that prints the text of a PDF page by page, grouped by
 * the rotation angle of the text.
 *
 * For every page the angles are collected with {@link AngleCollector}; then,
 * for each angle, the page is temporarily rotated back by that angle and the
 * text is extracted with {@link FilteredTextStripper}, which keeps only the
 * text whose computed angle is 0.
 *
 * @author Tilman Hausherr
 */
public class ExtractAngledText
{
    // Not touched by main(); kept so existing callers of getAngles() still compile.
    Set<Integer> angles = new HashSet<>();

    /**
     * Returns the angle set held by this instance.
     *
     * @return the (mutable) set of angles; {@link #main(String[])} never adds to it.
     */
    public Set<Integer> getAngles()
    {
        return angles;
    }

    /**
     * Prints, for each page of the given PDF, the collected text angles and
     * the text found at each of those angles.
     *
     * @param args The command line arguments: exactly one, the path of the PDF.
     *
     * @throws IOException If there is an error parsing the document.
     */
    public static void main(String[] args) throws IOException
    {
        if (args.length != 1)
        {
            usage();
            return;
        }

        try (PDDocument doc = PDDocument.load(new File(args[0])))
        {
            for (int p = 1; p <= doc.getNumberOfPages(); ++p)
            {
                System.out.printf("Page: %3d\n", p);
                System.out.println("----------");

                // A fresh collector per page, so that angles seen on earlier
                // pages do not show up in this page's result.
                AngleCollector angleCollector = new AngleCollector();
                angleCollector.setStartPage(p);
                angleCollector.setEndPage(p);
                angleCollector.getText(doc);
                System.out.println("Collected angles: " + angleCollector.getAngles());
                System.out.println();

                // Stripper page numbers are 1-based, page indices are 0-based.
                PDPage page = doc.getPage(p - 1);

                FilteredTextStripper filteredTextStripper = new FilteredTextStripper();
                filteredTextStripper.setStartPage(p);
                filteredTextStripper.setEndPage(p);

                for (int angle : angleCollector.getAngles())
                {
                    System.out.printf("Angle: %3d\n", angle);
                    System.out.println("----------");
                    String text = angle == 0
                            ? filteredTextStripper.getText(doc)
                            : extractRotatedText(doc, page, filteredTextStripper, angle);
                    System.out.println(text);
                }
            }
        }
    }

    /**
     * Extracts the text of one page after temporarily rotating its content by
     * {@code -angle} degrees, then restores the page's content array.
     *
     * @param doc the document the page belongs to.
     * @param page the page to rotate; must be the page the stripper is set to.
     * @param stripper the stripper used for the extraction.
     * @param angle the rotation, in degrees, of the text that is wanted.
     * @return the text returned by the stripper for the rotated page.
     * @throws IOException If writing the transformation or extracting the text fails.
     */
    private static String extractRotatedText(PDDocument doc, PDPage page,
            FilteredTextStripper stripper, int angle) throws IOException
    {
        // prepend a transformation
        try (PDPageContentStream cs =
                new PDPageContentStream(doc, page, AppendMode.PREPEND, false))
        {
            cs.transform(Matrix.getRotateInstance(-Math.toRadians(angle), 0, 0));
        }

        try
        {
            return stripper.getText(doc);
        }
        finally
        {
            // remove transformation, even when the extraction failed, so the
            // next angle does not stack on top of this rotation
            COSArray contents = (COSArray) page.getCOSObject().getItem(COSName.CONTENTS);
            contents.remove(0);
        }
    }

    /**
     * This will print the usage for this program.
     */
    private static void usage()
    {
        System.err.println("Usage: java " + ExtractAngledText.class.getName() + " <input-pdf>");
    }
}
/**
 * Text stripper that, instead of collecting text, records the angle of every
 * text position it is given. The angle is derived from the text matrix and
 * rounded to whole degrees.
 */
class AngleCollector extends PDFTextStripper
{
    Set<Integer> angles = new HashSet<>();

    /**
     * Creates the collector with an empty angle set.
     *
     * @throws IOException declared to match the superclass constructor.
     */
    public AngleCollector() throws IOException
    {
    }

    /**
     * Returns the angles recorded so far.
     *
     * @return the live set of angles in degrees; it is never cleared by this class.
     */
    public Set<Integer> getAngles()
    {
        return angles;
    }

    /**
     * Records the angle of the given text position. The superclass
     * implementation is deliberately not called.
     *
     * @param text the text position whose text matrix is inspected.
     */
    @Override
    protected void processTextPosition(TextPosition text)
    {
        Matrix textMatrix = text.getTextMatrix();
        double radians = Math.atan2(textMatrix.getShearY(), textMatrix.getScaleY());
        long degrees = Math.round(Math.toDegrees(radians));
        angles.add((int) degrees);
    }
}
/**
 * Text stripper that only passes on text positions whose angle, derived from
 * the text matrix and rounded to whole degrees, is 0.
 */
class FilteredTextStripper extends PDFTextStripper
{
    /**
     * Creates the stripper.
     *
     * @throws IOException declared to match the superclass constructor.
     */
    public FilteredTextStripper() throws IOException
    {
    }

    /**
     * Forwards the text position to the superclass when its rounded angle is
     * 0 degrees; any other text position is dropped.
     *
     * @param text the text position to examine.
     */
    @Override
    protected void processTextPosition(TextPosition text)
    {
        Matrix textMatrix = text.getTextMatrix();
        double radians = Math.atan2(textMatrix.getShearY(), textMatrix.getScaleY());
        int degrees = (int) Math.round(Math.toDegrees(radians));
        if (degrees != 0)
        {
            return;
        }
        super.processTextPosition(text);
    }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]