knguyen     2004/12/06 15:24:53 CET

  Modified files:        (Branch: JAHIA-4-0-BRANCH)
    src/java/org/jahia/utils/fileparsers PDFExtractor.java 
  Log:
  - Updated PDFBox text extraction to use the patched PDFBox library
  
  Revision  Changes    Path
  1.3.2.7   +46 -29    jahia/src/java/org/jahia/utils/fileparsers/PDFExtractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/utils/fileparsers/PDFExtractor.java.diff?r1=1.3.2.6&r2=1.3.2.7&f=h
  
  
  
  Index: PDFExtractor.java
  ===================================================================
  RCS file: /home/cvs/repository/jahia/src/java/org/jahia/utils/fileparsers/Attic/PDFExtractor.java,v
  retrieving revision 1.3.2.6
  retrieving revision 1.3.2.7
  diff -u -r1.3.2.6 -r1.3.2.7
  --- PDFExtractor.java 26 Oct 2004 11:17:39 -0000      1.3.2.6
  +++ PDFExtractor.java 6 Dec 2004 14:24:53 -0000       1.3.2.7
  @@ -4,7 +4,7 @@
   
   import org.jahia.utils.*;
   import org.pdfbox.cos.COSDocument;
  -import org.pdfbox.encryption.DecryptDocument;
  +import org.pdfbox.encryption.DocumentEncryption;
   import org.pdfbox.exceptions.CryptographyException;
   import org.pdfbox.exceptions.InvalidPasswordException;
   import org.pdfbox.pdfparser.PDFParser;
  @@ -24,6 +24,9 @@
       private static org.apache.log4j.Logger logger =
               org.apache.log4j.Logger.getLogger (PDFExtractor.class);
   
  +    private String path = null;
  +    private long lastModifed;
  +
       public PDFExtractor(){
       }
   
  @@ -53,8 +56,9 @@
       public String getContentAsString(String path, long lastModified,
                                        InputStream fileStream,
                                        String charSet) throws Exception {
  +        this.path = path;
  +        this.lastModifed = lastModified;
           String strVal = null;
  -
           if (fileStream != null) {
                Reader pdfReader = null;
                try {
  @@ -72,7 +76,7 @@
                                elapsedTime + "ms.");
                }
                catch (Throwable t) {
  -                 logger.debug(t);
  +                 logger.debug("Error extracting dpdf file " + this.path ,t);
                }
                finally {
                    try {
  @@ -92,38 +96,51 @@
       }
   
       public Reader getPDFReader(InputStream fileStream,
  -                               String charSet) throws IOException {
  +                               String charSet) throws IOException
  +    {
  +        Reader reader = null;
  +        PDDocument pdfDocument = null;
           try {
  -            BufferedInputStream bufFileStream = new 
BufferedInputStream(fileStream);
  -            PDFParser pdfParser = new PDFParser(bufFileStream);
  -            pdfParser.parse();
  -            PDDocument pdfDocument = pdfParser.getPDDocument();
  -            if(pdfDocument.isEncrypted()) {
  -                DecryptDocument decryptor = new DecryptDocument(pdfDocument);
  -                decryptor.decryptDocument("");
  +            pdfDocument = PDDocument.load(fileStream);
  +            if (pdfDocument.isEncrypted()) {
  +                //Just try using the default password and move on
  +                pdfDocument.decrypt("");
               }
  +            //create a tmp output stream with the size of the content.
  +            ByteArrayOutputStream out = new ByteArrayOutputStream();
  +            OutputStreamWriter writer = new OutputStreamWriter(out);
               PDFTextStripper stripper = new PDFTextStripper();
  -            String docText = stripper.getText(pdfDocument);
  -            byte contents[] = docText.getBytes(charSet);
  -
  -            try {
  -                pdfDocument.close();
  -            } catch ( Throwable t ){
  -            }
  -            try {
  -                bufFileStream.close();
  -            } catch ( Throwable t ){
  -            }
  -
  +            stripper.writeText(pdfDocument, writer);
  +            writer.close();
  +            byte[] contents = out.toByteArray();
               if ( charSet != null ){
  -                return new InputStreamReader(new 
ByteArrayInputStream(contents),
  +                reader = new InputStreamReader(new 
ByteArrayInputStream(contents),
                                                charSet);
  +            } else {
  +                reader = new InputStreamReader(new 
ByteArrayInputStream(contents));
  +            }
  +
  +            reader = new InputStreamReader(new 
ByteArrayInputStream(contents));
  +        }
  +        catch( CryptographyException e )
  +        {
  +            throw new IOException( "Error decrypting document(" + this.path 
+ "): " + e );
  +        }
  +        catch( InvalidPasswordException e )
  +        {
  +            //they didn't suppply a password and the default of "" was wrong.
  +            throw new IOException( "Error: The document(" + this.path +
  +                                    ") is encrypted and will not be 
indexed." );
  +        }
  +        finally
  +        {
  +            if( pdfDocument != null )
  +            {   try {
  +                    pdfDocument.close();
  +                }catch ( Throwable t ){
  +                }
               }
  -            return new InputStreamReader(new ByteArrayInputStream(contents));
  -        } catch (CryptographyException e) {
  -            throw new IOException();
  -        } catch (InvalidPasswordException e) {
  -            throw new IOException();
           }
  +        return reader;
       }
   }
  

Reply via email to