[
https://issues.apache.org/jira/browse/PDFBOX-617?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12855456#action_12855456
]
Stefano Falconetti commented on PDFBOX-617:
-------------------------------------------
Source of the crashing method:
/**
* Actually parse the file
* @throws IOException
*/
@SuppressWarnings("deprecation")
//TODO move to the non deprecated call
public void parse() throws IOException {
try{
if(this.myFileToParse != null){
this.inputStream = new
FileInputStream(this.myFileToParse);
}else{
this.inputStream = new
ByteArrayInputStream(this.rawDocument.getContent());
}
Parser parser = new AutoDetectParser();
ContentHandler textHandler = new BodyContentHandler();
Metadata metadata = new Metadata();
this.documentTitle = null;
this.documentKeywords = null;
this.documentContent = new String();
//TODO Move to a non deprecated call
//parser.parse(inputStream, this, metadata,
parseContext);
parser.parse(inputStream,
textHandler,
metadata);
//Get all arrays first
String[] tmpDocumentTitle =
metadata.getValues(Metadata.TITLE);
String[] tmpKeywords =
metadata.getValues(Metadata.KEYWORDS);
String[] tmpDocumentDescription =
metadata.getValues(Metadata.DESCRIPTION);
//###############################################################
// Sequence of utility methods' calls must be:
// stringArrayToString -> cleanUpExtraChars ->
tokenizeDeTokenize
//###############################################################
//Take keywords both from page keywords and description
int keywordsNum = 0;
int descriptionWordsNum = 0;
if(tmpKeywords != null){
keywordsNum = tmpKeywords.length;
}
if(tmpDocumentDescription != null){
descriptionWordsNum = tmpDocumentDescription.length;
}
int allKeywordsNum = keywordsNum + descriptionWordsNum;
String[] tmpAllKeywords = null;
//From title as last chance
if( (allKeywordsNum == 0) &&
(tmpDocumentTitle != null) &&
(tmpDocumentTitle.length != 0) ){
allKeywordsNum = 1;
logger.warn("No meta information found, using
title");
tmpAllKeywords = new
String[]{this.stringArrayToString(tmpDocumentTitle)};
}else{
tmpAllKeywords = new String[allKeywordsNum];
System.arraycopy(tmpKeywords,
0,
tmpAllKeywords,
0,
tmpKeywords.length);
System.arraycopy(tmpDocumentDescription,
0,
tmpAllKeywords,
tmpKeywords.length,
tmpDocumentDescription.length);
}
//Fill in public getters
this.documentTitle =
this.stringArrayToString(tmpDocumentTitle);
this.documentTitle =
this.cleanUpExtraChars(this.documentTitle);
this.documentTitle =
this.tokenizeDeTokenize(this.documentTitle);
this.documentKeywords =
this.stringArrayToString(tmpAllKeywords);
this.documentKeywords =
this.cleanUpExtraChars(this.documentKeywords);
//TODO if this value is needed (5), put it in the
configuration file
this.documentKeywords =
this.tokenizeDeTokenize(this.documentKeywords, 5).toLowerCase();
this.documentDescription =
this.stringArrayToString(tmpDocumentDescription);
this.documentDescription =
this.cleanUpExtraChars(this.documentDescription);
this.documentDescription =
this.tokenizeDeTokenize(this.documentDescription);
this.documentContent =
this.cleanUpExtraChars(textHandler.toString().trim());
//#####################################################
//### Very special cases of very bad document found ###
//#####################################################
if((this.documentTitle == null) ||
(this.documentTitle.trim().equals(""))){
this.documentTitle =
this.guessTitle(this.documentContent, this.rawDocument.getURL().getHost());
}
if((this.documentKeywords == null) ||
(this.documentKeywords.trim().equals(""))){
this.documentKeywords =
this.guessKeywords(this.documentContent);
}
//##############################################à
//Semantic checks:
//Checking if keywords are appropriate, as being
present in content also.
this.documentKeywords =
this.contentKeywordsConsistencyCheck(this.documentKeywords,
this.documentContent);
}catch(FileNotFoundException fnfExc) {
throw new IOException(fnfExc);
}catch(SAXException sExc) {
throw new IOException(sExc);
}catch(TikaException tExc) {
throw new IOException(tExc);
}catch(Exception exc) {
throw new IOException(exc);
}
}
> Crash parsing pdf file
> (http://media.opentur.it/WEB/CHANNELS/COCKTAILVIAGGI/CMS/PDF/Irlanda%202009%2028-51pag.pdf)
> from Tika
> ----------------------------------------------------------------------------------------------------------------------------
>
> Key: PDFBOX-617
> URL: https://issues.apache.org/jira/browse/PDFBOX-617
> Project: PDFBox
> Issue Type: Bug
> Components: Parsing
> Affects Versions: 0.8.0-incubator
> Environment: Linux debian: Linux 2.6.18-6-686 #1 SMP i686 GNU/Linux
> java version "1.6.0_13"
> Java(TM) SE Runtime Environment (build 1.6.0_13-b03)
> Java HotSpot(TM) Client VM (build 11.3-b02, mixed mode, sharing)
> Reporter: Stefano Falconetti
> Priority: Critical
> Attachments: Irlanda125pag.pdf, Irlanda26-52pag.pdf,
> Portogallo2010.pdf, StatiUniti2010_1.pdf
>
>
> Parsing the file
> http://media.opentur.it/WEB/CHANNELS/COCKTAILVIAGGI/CMS/PDF/Irlanda%202009%2028-51pag.pdf
> the call to Tika "parse" fails with the following stack trace:
> java.io.IOException: org.apache.tika.exception.TikaException: TIKA-198:
> Illegal IOException from org.apache.tika.parser.pdf.pdfpar...@1578aab
> at
> com.travelport.indexing.documentparser.GenericDocumentParserTikaImpl.parse(GenericDocumentParserTikaImpl.java:143)
> at
> com.travelport.indexing.documentparser.GenericDocumentParserTikaImpl.main(GenericDocumentParserTikaImpl.java:306)
> Caused by: org.apache.tika.exception.TikaException: TIKA-198: Illegal
> IOException from org.apache.tika.parser.pdf.pdfpar...@1578aab
> at
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:126)
> at
> org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:101)
> at
> org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:114)
> at
> com.travelport.indexing.documentparser.GenericDocumentParserTikaImpl.parse(GenericDocumentParserTikaImpl.java:69)
> ... 1 more
> Caused by: org.apache.pdfbox.exceptions.WrappedIOException
> at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:237)
> at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:841)
> at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:808)
> at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:53)
> at
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:120)
> ... 4 more
> Caused by: java.util.NoSuchElementException
> at java.util.AbstractList$Itr.next(AbstractList.java:350)
> at
> org.apache.pdfbox.pdfparser.PDFXrefStreamParser.parse(PDFXrefStreamParser.java:115)
> at
> org.apache.pdfbox.cos.COSDocument.parseXrefStreams(COSDocument.java:538)
> at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:203)
> ... 8 more
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.