[ 
https://issues.apache.org/jira/browse/PDFBOX-2775?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14514758#comment-14514758
 ] 

ASF subversion and git services commented on PDFBOX-2775:
---------------------------------------------------------

Commit 1676361 from [~tilman] in branch 'pdfbox/trunk'
[ https://svn.apache.org/r1676361 ]

PDFBOX-2775: disable ShouldSeparateByBeads in this derived class

> ArrayIndexOutOfBoundsException in PDFTextStripper.processTextPosition()
> -----------------------------------------------------------------------
>
>                 Key: PDFBOX-2775
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-2775
>             Project: PDFBox
>          Issue Type: Bug
>          Components: Text extraction
>    Affects Versions: 2.0.0
>            Reporter: Tilman Hausherr
>         Attachments: jaf-1-150219.pdf
>
>
> Reported by Andrew M. in the user mailing list:
> {code}
> Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: Array index out of range: 3
>       at java.util.Vector.get(Vector.java:744)
>       at org.apache.pdfbox.text.PDFTextStripper.processTextPosition(PDFTextStripper.java:903)
>       at org.apache.pdfbox.text.PDFTextStripperByArea.processTextPosition(PDFTextStripperByArea.java:132)
>       at org.apache.pdfbox.text.PDFTextStreamEngine.showGlyph(PDFTextStreamEngine.java:229)
>       at org.apache.pdfbox.contentstream.PDFStreamEngine.showText(PDFStreamEngine.java:717)
>       at org.apache.pdfbox.contentstream.PDFStreamEngine.showTextStrings(PDFStreamEngine.java:627)
>       at org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted.process(ShowTextAdjusted.java:38)
>       at org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:829)
>       at org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:490)
>       at org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:456)
>       at org.apache.pdfbox.contentstream.PDFStreamEngine.processPage(PDFStreamEngine.java:167)
>       at org.apache.pdfbox.text.PDFTextStreamEngine.processPage(PDFTextStreamEngine.java:117)
>       at org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:347)
>       at org.apache.pdfbox.text.PDFTextStripperByArea.extractRegions(PDFTextStripperByArea.java:113)
>       at testpdfbox20.ExtractTextError.textFromBox(ExtractTextError.java:25)
>       at testpdfbox20.ExtractTextError.main(ExtractTextError.java:45)
> {code}
> {code}
> public class ExtractTextError
> {
>     static String textFromBox(PDDocument doc, int x, int y, int w, int h, int page)
>             throws IOException
>     {
>         PDFTextStripperByArea stripper = new PDFTextStripperByArea();
>         Rectangle rect = new Rectangle(x, y - h, w, h);
>         stripper.addRegion("region", rect);
>         int pageCount = doc.getDocumentCatalog().getPages().getCount();
>         System.out.println("getting text from page #" + page + " of " + pageCount + " in doc.");
>         if (page <= pageCount)
>         {
>             PDPage pp = doc.getDocumentCatalog().getPages().get(page - 1);
>             stripper.extractRegions(pp);
>             String text = stripper.getTextForRegion("region");
>             System.out.println("text=" + text);
>             return text;
>         }
>         else
>         {
>             return "No page #" + page;
>         }
>     }
>     public static void main(String[] args) throws IOException
>     {
>         PDDocument doc = PDDocument.load(new File("jaf-1-150219.pdf"));
>         textFromBox(doc, 33, 159, 216, 43, 1);
>     }
> }
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to