Irlanda%202009%2028-51pag.pdf) from Tika

Stefano Falconetti (JIRA) Fri, 09 Apr 2010 09:23:13 -0700

    [ 
https://issues.apache.org/jira/browse/PDFBOX-617?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12855456#action_12855456
 ]


Stefano Falconetti commented on PDFBOX-617:
-------------------------------------------

Source of method crashing:


      /**
         * Actually parse the file
         * @throws IOException
         */
        @SuppressWarnings("deprecation")
        //TODO move to the non deprecated call
        public void parse() throws IOException {
                
                try{
                        if(this.myFileToParse != null){
                           this.inputStream = new 
FileInputStream(this.myFileToParse);
                        }else{
                                  this.inputStream = new 
ByteArrayInputStream(this.rawDocument.getContent());
                        }
                        
                        Parser parser = new AutoDetectParser();
                        ContentHandler textHandler = new BodyContentHandler();
                        Metadata metadata = new Metadata();
                        
                        this.documentTitle = null;
                        this.documentKeywords = null;
                        this.documentContent = new String();
                        //TODO Move to a non deprecated call
                        //parser.parse(inputStream, this, metadata, 
parseContext);
                        parser.parse(inputStream, 
                                                 textHandler, 
                                                 metadata);
                        //Get all arrays first
                        String[] tmpDocumentTitle = 
metadata.getValues(Metadata.TITLE);
                        String[] tmpKeywords = 
metadata.getValues(Metadata.KEYWORDS);
                        String[] tmpDocumentDescription = 
metadata.getValues(Metadata.DESCRIPTION); 
                        
                        
//###############################################################
                        // Sequence of utility methods' calls must be: 
                        // stringArrayToString -> cleanUpExtraChars -> 
tokenizeDeTokenize
                        
//###############################################################

                        //Take keywords both from page keywords and description
                        int keywordsNum = 0;
                        int descriptionWordsNum = 0;
                        
                        if(tmpKeywords != null){
                           keywordsNum = tmpKeywords.length;
                        }
                        
                        if(tmpDocumentDescription != null){
                           descriptionWordsNum = tmpDocumentDescription.length;
                        }
                        
                        int allKeywordsNum = keywordsNum + descriptionWordsNum;
                        
                        String[] tmpAllKeywords = null;
                        
                        //From title as last chance
                        if( (allKeywordsNum == 0) &&
                                (tmpDocumentTitle != null) &&
                                (tmpDocumentTitle.length != 0) ){
                           allKeywordsNum = 1;
                           logger.warn("No meta information found, using 
title");
                           tmpAllKeywords = new 
String[]{this.stringArrayToString(tmpDocumentTitle)};
                        }else{
                                tmpAllKeywords = new String[allKeywordsNum]; 
                        
                                  System.arraycopy(tmpKeywords, 
                                                                   0, 
                                                                   
tmpAllKeywords, 
                                                                   0, 
                                                                   
tmpKeywords.length);
                                        
                                  System.arraycopy(tmpDocumentDescription, 
                                                                   0, 
                                                                   
tmpAllKeywords, 
                                                                   
tmpKeywords.length, 
                                                                   
tmpDocumentDescription.length);
                        }
                        //Fill in public getters
                        this.documentTitle = 
this.stringArrayToString(tmpDocumentTitle);
                        this.documentTitle = 
this.cleanUpExtraChars(this.documentTitle);
                        this.documentTitle = 
this.tokenizeDeTokenize(this.documentTitle);
                        
                        this.documentKeywords = 
this.stringArrayToString(tmpAllKeywords);
                        this.documentKeywords = 
this.cleanUpExtraChars(this.documentKeywords);
                        //TODO if this value is needed (5), put it in the 
configuration file
                        this.documentKeywords = 
this.tokenizeDeTokenize(this.documentKeywords, 5).toLowerCase();
                        
                        this.documentDescription = 
this.stringArrayToString(tmpDocumentDescription);
                        this.documentDescription = 
this.cleanUpExtraChars(this.documentDescription);
                        this.documentDescription = 
this.tokenizeDeTokenize(this.documentDescription);
                                                
                        this.documentContent = 
this.cleanUpExtraChars(textHandler.toString().trim());
                                                
                        //#####################################################
                        //### Very special cases of very bad document found ###
                        //#####################################################
                        if((this.documentTitle == null) ||
                           (this.documentTitle.trim().equals(""))){
                                
                                this.documentTitle = 
this.guessTitle(this.documentContent, this.rawDocument.getURL().getHost());     
                                                                                
                     
                        }
                        
                        if((this.documentKeywords == null) ||
                           (this.documentKeywords.trim().equals(""))){
                                this.documentKeywords = 
this.guessKeywords(this.documentContent);
                        }
                        //##############################################à
                        
                        //Semantic checks:
                        //Checking if keywords are appropriate, as being 
present in content also.
                        this.documentKeywords = 
this.contentKeywordsConsistencyCheck(this.documentKeywords, 
                                                                                
                                                             
this.documentContent);
                        
                }catch(FileNotFoundException fnfExc) {
                           throw new IOException(fnfExc);
                }catch(SAXException sExc) {
                           throw new IOException(sExc);
                }catch(TikaException tExc) {
                           throw new IOException(tExc);
                }catch(Exception exc) {
                           throw new IOException(exc);
                }
        }

> Crash parsing pdf file 
> (http://media.opentur.it/WEB/CHANNELS/COCKTAILVIAGGI/CMS/PDF/Irlanda%202009%2028-51pag.pdf)
>  from Tika
> ----------------------------------------------------------------------------------------------------------------------------
>
>                 Key: PDFBOX-617
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-617
>             Project: PDFBox
>          Issue Type: Bug
>          Components: Parsing
>    Affects Versions: 0.8.0-incubator
>         Environment: Linux debian: Linux 2.6.18-6-686 #1 SMP i686 GNU/Linux 
> java version "1.6.0_13"
> Java(TM) SE Runtime Environment (build 1.6.0_13-b03)
> Java HotSpot(TM) Client VM (build 11.3-b02, mixed mode, sharing)
>            Reporter: Stefano Falconetti
>            Priority: Critical
>         Attachments: Irlanda125pag.pdf, Irlanda26-52pag.pdf, 
> Portogallo2010.pdf, StatiUniti2010_1.pdf
>
>
> Parsing the file 
> http://media.opentur.it/WEB/CHANNELS/COCKTAILVIAGGI/CMS/PDF/Irlanda%202009%2028-51pag.pdf
>  the call to Tika "parse" fails with the following stack trace:
> java.io.IOException: org.apache.tika.exception.TikaException: TIKA-198: 
> Illegal IOException from org.apache.tika.parser.pdf.pdfpar...@1578aab
>       at 
> com.travelport.indexing.documentparser.GenericDocumentParserTikaImpl.parse(GenericDocumentParserTikaImpl.java:143)
>       at 
> com.travelport.indexing.documentparser.GenericDocumentParserTikaImpl.main(GenericDocumentParserTikaImpl.java:306)
> Caused by: org.apache.tika.exception.TikaException: TIKA-198: Illegal 
> IOException from org.apache.tika.parser.pdf.pdfpar...@1578aab
>       at 
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:126)
>       at 
> org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:101)
>       at 
> org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:114)
>       at 
> com.travelport.indexing.documentparser.GenericDocumentParserTikaImpl.parse(GenericDocumentParserTikaImpl.java:69)
>       ... 1 more
> Caused by: org.apache.pdfbox.exceptions.WrappedIOException
>       at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:237)
>       at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:841)
>       at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:808)
>       at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:53)
>       at 
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:120)
>       ... 4 more
> Caused by: java.util.NoSuchElementException
>       at java.util.AbstractList$Itr.next(AbstractList.java:350)
>       at 
> org.apache.pdfbox.pdfparser.PDFXrefStreamParser.parse(PDFXrefStreamParser.java:115)
>       at 
> org.apache.pdfbox.cos.COSDocument.parseXrefStreams(COSDocument.java:538)
>       at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:203)
>       ... 8 more

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.

[jira] Commented: (PDFBOX-617) Crash parsing pdf file (http://media.opentur.it/WEB/CHANNELS/COCKTAILVIAGGI/CMS/PDF/Irlanda%202009%2028-51pag.pdf) from Tika

Reply via email to