knguyen 2004/12/06 15:24:53 CET
Modified files: (Branch: JAHIA-4-0-BRANCH)
src/java/org/jahia/utils/fileparsers PDFExtractor.java
Log:
- updated PDF text extraction to use the patched PDFBox library
Revision Changes Path
1.3.2.7 +46 -29
jahia/src/java/org/jahia/utils/fileparsers/PDFExtractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/utils/fileparsers/PDFExtractor.java.diff?r1=1.3.2.6&r2=1.3.2.7&f=h
Index: PDFExtractor.java
===================================================================
RCS file:
/home/cvs/repository/jahia/src/java/org/jahia/utils/fileparsers/Attic/PDFExtractor.java,v
retrieving revision 1.3.2.6
retrieving revision 1.3.2.7
diff -u -r1.3.2.6 -r1.3.2.7
--- PDFExtractor.java 26 Oct 2004 11:17:39 -0000 1.3.2.6
+++ PDFExtractor.java 6 Dec 2004 14:24:53 -0000 1.3.2.7
@@ -4,7 +4,7 @@
import org.jahia.utils.*;
import org.pdfbox.cos.COSDocument;
-import org.pdfbox.encryption.DecryptDocument;
+import org.pdfbox.encryption.DocumentEncryption;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.pdfbox.pdfparser.PDFParser;
@@ -24,6 +24,9 @@
private static org.apache.log4j.Logger logger =
org.apache.log4j.Logger.getLogger (PDFExtractor.class);
+ private String path = null;
+ private long lastModifed;
+
public PDFExtractor(){
}
@@ -53,8 +56,9 @@
public String getContentAsString(String path, long lastModified,
InputStream fileStream,
String charSet) throws Exception {
+ this.path = path;
+ this.lastModifed = lastModified;
String strVal = null;
-
if (fileStream != null) {
Reader pdfReader = null;
try {
@@ -72,7 +76,7 @@
elapsedTime + "ms.");
}
catch (Throwable t) {
- logger.debug(t);
+ logger.debug("Error extracting dpdf file " + this.path ,t);
}
finally {
try {
@@ -92,38 +96,51 @@
}
public Reader getPDFReader(InputStream fileStream,
- String charSet) throws IOException {
+ String charSet) throws IOException
+ {
+ Reader reader = null;
+ PDDocument pdfDocument = null;
try {
- BufferedInputStream bufFileStream = new
BufferedInputStream(fileStream);
- PDFParser pdfParser = new PDFParser(bufFileStream);
- pdfParser.parse();
- PDDocument pdfDocument = pdfParser.getPDDocument();
- if(pdfDocument.isEncrypted()) {
- DecryptDocument decryptor = new DecryptDocument(pdfDocument);
- decryptor.decryptDocument("");
+ pdfDocument = PDDocument.load(fileStream);
+ if (pdfDocument.isEncrypted()) {
+ //Just try using the default password and move on
+ pdfDocument.decrypt("");
}
+ //create a tmp output stream with the size of the content.
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ OutputStreamWriter writer = new OutputStreamWriter(out);
PDFTextStripper stripper = new PDFTextStripper();
- String docText = stripper.getText(pdfDocument);
- byte contents[] = docText.getBytes(charSet);
-
- try {
- pdfDocument.close();
- } catch ( Throwable t ){
- }
- try {
- bufFileStream.close();
- } catch ( Throwable t ){
- }
-
+ stripper.writeText(pdfDocument, writer);
+ writer.close();
+ byte[] contents = out.toByteArray();
if ( charSet != null ){
- return new InputStreamReader(new
ByteArrayInputStream(contents),
+ reader = new InputStreamReader(new
ByteArrayInputStream(contents),
charSet);
+ } else {
+ reader = new InputStreamReader(new
ByteArrayInputStream(contents));
+ }
+
+ reader = new InputStreamReader(new
ByteArrayInputStream(contents));
+ }
+ catch( CryptographyException e )
+ {
+ throw new IOException( "Error decrypting document(" + this.path
+ "): " + e );
+ }
+ catch( InvalidPasswordException e )
+ {
+ //they didn't supply a password and the default of "" was wrong.
+ throw new IOException( "Error: The document(" + this.path +
+ ") is encrypted and will not be
indexed." );
+ }
+ finally
+ {
+ if( pdfDocument != null )
+ { try {
+ pdfDocument.close();
+ }catch ( Throwable t ){
+ }
}
- return new InputStreamReader(new ByteArrayInputStream(contents));
- } catch (CryptographyException e) {
- throw new IOException();
- } catch (InvalidPasswordException e) {
- throw new IOException();
}
+ return reader;
}
}