Hey everyone,
I just started using PDFBox. However, when I try to extract the text from a
large PDF file (100-300 pages), an exception occurs (although the text is
still output). Could someone tell me the reason for this error? Thank you.
I have included the code below. Note that "test.pdf" is a 5 MB ebook with lots of
text and images. In addition, no exception occurs if I strip a smaller number
of pages (the first 10 pages, for example).
The code is:
PDDocument document = PDDocument.load("test.pdf");
PDFTextStripper stripper = null;
try
{
stripper = new PDFTextStripper();
} catch (IOException e)
{
e.printStackTrace();
}
stripper.setStartPage(1);
String text = stripper.getText(document);
The Exception is:
Aug 4, 2010 12:40:29 PM org.apache.pdfbox.pdmodel.font.PDFontFactory
createFont
WARNING: Failed to create Type1C font. Falling back to Type1 font
java.lang.IndexOutOfBoundsException: Index: 2,Size: 2
at java.util.SubList.rangeCheck(AbstractList.java:746)
at java.util.SubList.get(AbstractList.java:619)
at
org.apache.fontbox.cff.CharStringConverter.drawAlternatingCurve(CharStringConverter.java:306)
at
org.apache.fontbox.cff.CharStringConverter.handleType1Command(CharStringConverter.java:137)
at
org.apache.fontbox.cff.CharStringConverter.handleCommand(CharStringConverter.java:72)
at
org.apache.fontbox.cff.CharStringHandler.handleSequence(CharStringHandler.java:46)
at
org.apache.fontbox.cff.CharStringConverter.convert(CharStringConverter.java:59)
at
org.apache.fontbox.cff.CFFFont$Mapping.toType1Sequence(CFFFont.java:338)
at org.apache.fontbox.cff.AFMFormatter.renderFont(AFMFormatter.java:126)
at
org.apache.fontbox.cff.AFMFormatter.printFontMetrics(AFMFormatter.java:64)
at org.apache.fontbox.cff.AFMFormatter.printFont(AFMFormatter.java:57)
at org.apache.fontbox.cff.AFMFormatter.format(AFMFormatter.java:50)
at
org.apache.pdfbox.pdmodel.font.PDType1CFont.prepareFontMetric(PDType1CFont.java:530)
at
org.apache.pdfbox.pdmodel.font.PDType1CFont.load(PDType1CFont.java:404)
at
org.apache.pdfbox.pdmodel.font.PDType1CFont.<init>(PDType1CFont.java:123)
at
org.apache.pdfbox.pdmodel.font.PDFontFactory.createFont(PDFontFactory.java:124)
at
org.apache.pdfbox.pdmodel.font.PDFontFactory.createFont(PDFontFactory.java:76)
at org.apache.pdfbox.pdmodel.PDResources.getFonts(PDResources.java:115)
at
org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:227)
at
org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:208)
at
org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:378)
at
org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:302)
at
org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:258)
at
org.apache.pdfbox.util.PDFTextStripper.getText(PDFTextStripper.java:184)
at test.TestPDF.main(TestPDF.java:32)
Aug 4, 2010 12:40:29 PM org.apache.pdfbox.pdmodel.font.PDFontFactory
createFont
WARNING: Failed to create Type1C font. Falling back to Type1 font
java.lang.NullPointerException
at
org.apache.fontbox.cff.CharStringRenderer.rrcurveTo(CharStringRenderer.java:299)
at
org.apache.fontbox.cff.CharStringRenderer.handleCommandType1(CharStringRenderer.java:239)
at
org.apache.fontbox.cff.CharStringRenderer.handleCommand(CharStringRenderer.java:69)
at
org.apache.fontbox.cff.CharStringHandler.handleSequence(CharStringHandler.java:46)
at
org.apache.fontbox.cff.CharStringRenderer.render(CharStringRenderer.java:59)
at org.apache.fontbox.cff.AFMFormatter.renderFont(AFMFormatter.java:126)
at
org.apache.fontbox.cff.AFMFormatter.printFontMetrics(AFMFormatter.java:64)
at org.apache.fontbox.cff.AFMFormatter.printFont(AFMFormatter.java:57)
at org.apache.fontbox.cff.AFMFormatter.format(AFMFormatter.java:50)
at
org.apache.pdfbox.pdmodel.font.PDType1CFont.prepareFontMetric(PDType1CFont.java:530)
at
org.apache.pdfbox.pdmodel.font.PDType1CFont.load(PDType1CFont.java:404)
at
org.apache.pdfbox.pdmodel.font.PDType1CFont.<init>(PDType1CFont.java:123)
at
org.apache.pdfbox.pdmodel.font.PDFontFactory.createFont(PDFontFactory.java:124)
at
org.apache.pdfbox.pdmodel.font.PDFontFactory.createFont(PDFontFactory.java:76)
at org.apache.pdfbox.pdmodel.PDResources.getFonts(PDResources.java:115)
at
org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:227)
at
org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:208)
at
org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:378)
at
org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:302)
at
org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:258)
at
org.apache.pdfbox.util.PDFTextStripper.getText(PDFTextStripper.java:184)
at test.TestPDF.main(TestPDF.java:32)
Aug 4, 2010 12:40:29 PM org.apache.pdfbox.pdmodel.font.PDFontFactory
createFont
WARNING: Failed to create Type1C font. Falling back to Type1 font
java.lang.IndexOutOfBoundsException: Index: 1,Size: 1
at java.util.SubList.rangeCheck(AbstractList.java:746)
at java.util.SubList.get(AbstractList.java:619)
at
org.apache.fontbox.cff.CharStringConverter.drawAlternatingCurve(CharStringConverter.java:313)
at
org.apache.fontbox.cff.CharStringConverter.handleType1Command(CharStringConverter.java:137)
at
org.apache.fontbox.cff.CharStringConverter.handleCommand(CharStringConverter.java:72)
at
org.apache.fontbox.cff.CharStringHandler.handleSequence(CharStringHandler.java:46)
at
org.apache.fontbox.cff.CharStringConverter.convert(CharStringConverter.java:59)
at
org.apache.fontbox.cff.CFFFont$Mapping.toType1Sequence(CFFFont.java:338)
at org.apache.fontbox.cff.AFMFormatter.renderFont(AFMFormatter.java:126)
at
org.apache.fontbox.cff.AFMFormatter.printFontMetrics(AFMFormatter.java:64)
at org.apache.fontbox.cff.AFMFormatter.printFont(AFMFormatter.java:57)
at org.apache.fontbox.cff.AFMFormatter.format(AFMFormatter.java:50)
at
org.apache.pdfbox.pdmodel.font.PDType1CFont.prepareFontMetric(PDType1CFont.java:530)
at
org.apache.pdfbox.pdmodel.font.PDType1CFont.load(PDType1CFont.java:404)
at
org.apache.pdfbox.pdmodel.font.PDType1CFont.<init>(PDType1CFont.java:123)
at
org.apache.pdfbox.pdmodel.font.PDFontFactory.createFont(PDFontFactory.java:124)
at
org.apache.pdfbox.pdmodel.font.PDFontFactory.createFont(PDFontFactory.java:76)
at org.apache.pdfbox.pdmodel.PDResources.getFonts(PDResources.java:115)
at
org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:227)
at
org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:208)
at
org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:378)
at
org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:302)
at
org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:258)
at
org.apache.pdfbox.util.PDFTextStripper.getText(PDFTextStripper.java:184)
at test.TestPDF.main(TestPDF.java:32)