knguyen 2005/04/08 16:38:59 CEST
Modified files: (Branch: JAHIA-4-0-BRANCH)
src/java/org/jahia/utils/fileparsers HDFExtractor.java
PDFExtractor.java
Log:
- fix pdf and charset parsing
Revision Changes Path
1.4.4.3 +14 -292
jahia/src/java/org/jahia/utils/fileparsers/HDFExtractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/utils/fileparsers/HDFExtractor.java.diff?r1=1.4.4.2&r2=1.4.4.3&f=h
1.3.2.10 +6 -1
jahia/src/java/org/jahia/utils/fileparsers/PDFExtractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/utils/fileparsers/PDFExtractor.java.diff?r1=1.3.2.9&r2=1.3.2.10&f=h
Index: HDFExtractor.java
===================================================================
RCS file: /home/cvs/repository/jahia/src/java/org/jahia/utils/fileparsers/Attic/HDFExtractor.java,v
retrieving revision 1.4.4.2
retrieving revision 1.4.4.3
diff -u -r1.4.4.2 -r1.4.4.3
--- HDFExtractor.java 20 Oct 2004 09:46:09 -0000 1.4.4.2
+++ HDFExtractor.java 8 Apr 2005 14:38:58 -0000 1.4.4.3
@@ -1,22 +1,11 @@
package org.jahia.utils.fileparsers;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileWriter;
-import java.io.FilenameFilter;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
+import java.io.*;
import java.util.Iterator;
-import org.apache.poi.hdf.extractor.WordDocument;
-import org.apache.poi.poifs.filesystem.DirectoryEntry;
-import org.apache.poi.poifs.filesystem.DocumentEntry;
-import org.apache.poi.poifs.filesystem.DocumentInputStream;
-import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import java.io.BufferedInputStream;
+import org.apache.slide.extractor.MSWordExtractor;
+import org.jahia.utils.FileUtils;
+
/**
* <p>Title: HDF Text content extractor file</p>
@@ -31,8 +20,9 @@
*/
public class HDFExtractor implements FileExtractor {
- private static final String rtfMagicCode = "{\\rtf";
- private static final int BUF_LEN = 100;
+
+ private static org.apache.log4j.Logger logger =
+ org.apache.log4j.Logger.getLogger (HDFExtractor.class);
public HDFExtractor() {
}
@@ -48,12 +38,7 @@
*/
public String getContentAsString(String path, long lastModified,
InputStream fileStream)
throws IOException {
- BufferedInputStream bufStream = new BufferedInputStream(fileStream, BUF_LEN);
- if (isFileRTF(bufStream)) {
- throw new IOException("File is in RTF format. Cannot process.");
- } else {
- return getHDFText(bufStream, false, false);
- }
+ return getContentAsString(path, lastModified, fileStream, null);
}
/**
@@ -69,280 +54,17 @@
public String getContentAsString(String path, long lastModified,
InputStream fileStream, String charSet)
throws IOException {
- return getContentAsString(path, lastModified, fileStream);
-
- }
-
- private String getHDFText(InputStream fileStream,
- boolean displayUnknownChars,
- boolean displayControlChars) throws
- IOException {
- WordDocument wordDocument = new WordDocument(fileStream);
- StringWriter sw = new StringWriter();
- wordDocument.writeAllText(sw);
- String result = sw.toString();
- StringWriter writer = new StringWriter();
- int lineLengthCounter = 0;
- for (int i = 0; i < result.length(); i++) {
- char ch = result.charAt(i);
- if (ch >= 32) {
- writer.write(ch);
- lineLengthCounter++;
- } else {
- switch (ch) {
- case 9:
- writer.write(ch);
- lineLengthCounter += 3;
- break;
- case 11:
- if (displayControlChars) {
- writer.write("<br>");
- lineLengthCounter += "<br>".length();
- } else {
- writer.write(System.getProperty(
- "line.separator"));
- lineLengthCounter = 0;
- }
- break;
- case 13:
- writer.write(System.getProperty("line.separator"));
- lineLengthCounter = 0;
- break;
- case 14:
- if (displayControlChars) {
- writer.write("COLUMNBREAK");
- lineLengthCounter += "COLUMNBREAK".length();
- } else {
- writer.write(" ");
- lineLengthCounter++;
- }
- break;
- case 19:
- if (displayControlChars) {
- writer.write("FIELDBEGIN");
- lineLengthCounter += "FIELDBEGIN".length();
- } else {
- writer.write(" ");
- lineLengthCounter++;
- }
- break;
- case 20:
- if (displayControlChars) {
- writer.write("FIELDSEP");
- lineLengthCounter += "FIELDSEP".length();
- } else {
- writer.write(" ");
- lineLengthCounter++;
- }
- break;
- case 21:
- if (displayControlChars) {
- writer.write("FIELDEND");
- lineLengthCounter += "FIELDEND".length();
- } else {
- writer.write(" ");
- lineLengthCounter++;
- }
- break;
- case 160:
- if (displayControlChars) {
- writer.write(" ");
- lineLengthCounter += " ".length();
- } else {
- writer.write(" ");
- lineLengthCounter++;
- }
- break;
- default:
- if (displayUnknownChars) {
- writer.write("0x" + Integer.toHexString(ch));
- lineLengthCounter +=
- ( (String) "0x" + Integer.toHexString(ch)).
- length();
- }
- }
- }
- /*
- if (lineLengthCounter > 75) {
- writer.write(System.getProperty("line.separator"));
- lineLengthCounter = 0;
- }
- */
- }
- return writer.toString();
- }
-
- private boolean displayHDFFile(String fileName, boolean displayUnknownChars,
- boolean displayControlChars) {
- // displayFileSystem(fileName);
- boolean successful = true;
+ MSWordExtractor ex = new MSWordExtractor("","","");
try {
- FileInputStream fileStream = new FileInputStream(fileName);
- String outputStr = getHDFText(fileStream, displayUnknownChars, displayControlChars);
- if (outputStr.length() > 0) {
- FileWriter fileWriter = new FileWriter(fileName + ".txt");
- fileWriter.write(outputStr);
- fileWriter.flush();
- fileWriter.close();
- } else {
- System.out.println("Problem during text extraction, content for file "+fileName+ " is empty.");
- successful = false;
- }
- } catch (IOException ioe) {
- System.out.println("Error in file " + fileName + " : ");
- ioe.printStackTrace(System.out);
- successful = false;
+ Reader reader = ex.extract(fileStream);
+ return FileUtils.readerToString(reader);
+ } catch ( Throwable t ){
+ logger.debug("Exception extraction file " + path, t);
+ throw new IOException(t.getMessage());
}
- return successful;
- }
- private void displayFileSystem(String fileName) throws NullPointerException {
- try {
- InputStream inputStream = new FileInputStream(fileName);
- POIFSFileSystem fs;
- fs = new POIFSFileSystem(inputStream);
- DirectoryEntry root = fs.getRoot();
-
- // dir is an instance of DirectoryEntry ...
- for (Iterator iter = root.getEntries(); iter.hasNext(); ) {
- Entry entry = (Entry) iter.next();
- System.out.print("found entry: " + entry.getName());
- if (entry instanceof DirectoryEntry) {
- // .. recurse into this directory
- System.out.println(",type=directory");
- } else if (entry instanceof DocumentEntry) {
- // entry is a document, which you can read
- DocumentEntry docEntry = (DocumentEntry) entry;
- System.out.println(",type=document,size=" +
- docEntry.getSize() + " bytes");
- if (entry.getName().equalsIgnoreCase("WordDocument")) {
- DocumentInputStream docStream = new DocumentInputStream(
- docEntry);
- byte[] content = new byte[docStream.available()];
- docStream.read(content);
- docStream.close();
- int lineLengthCounter = 0;
- char previousChar = 0;
- for (int i = 0; i < content.length; i++) {
- if (content[i] >= 32 && content[i] < 127) {
- System.out.print( (char) content[i]);
- } else {
- int byteInt = content[i] & 0xFF;
- System.out.print("0x" +
- Integer.toHexString(byteInt));
- lineLengthCounter++;
- lineLengthCounter++;
- lineLengthCounter++;
- }
- lineLengthCounter++;
- if (lineLengthCounter > 75) {
- System.out.println("");
- lineLengthCounter = 0;
- }
- previousChar = (char) content[i];
- }
- System.out.println("");
- }
- } else {
- System.out.println(",type=other");
- // currently, either an Entry is a DirectoryEntry or a DocumentEntry,
- // but in the future, there may be other entry subinterfaces. The
- // internal data structure certainly allows for a lot more entry types.
- }
- }
-
- } catch (FileNotFoundException fnfe) {
- fnfe.printStackTrace();
- } catch (IOException e) {
- // an I/O error occurred, or the InputStream did not provide a compatible
- // POIFS data structure
- e.printStackTrace();
- }
}
- class DocFileFilter
- implements FilenameFilter {
- public boolean accept(File dir,
- String name) {
- if (name.toLowerCase().endsWith(".doc")) {
- return true;
- } else {
- return false;
- }
- }
- }
- private int processHDFFilesInDir(String directoryName) {
- int errorCount = 0;
- File currentDir = new File(directoryName);
- if (currentDir.isDirectory()) {
- DocFileFilter docFilter = new DocFileFilter();
- File[] docFiles = currentDir.listFiles(docFilter);
- System.out.println("Processing "+docFiles.length+" *.doc files in " +
- currentDir.toString() + "...");
- for (int i = 0; i < docFiles.length; i++) {
- // System.out.println("Dumping text of " + docFiles[i].toString());
- FileInputStream fileStream = null;
- try {
- fileStream = new FileInputStream(docFiles[i]);
- } catch (FileNotFoundException e) {
- continue;
- }
- if (!isFileRTF(fileStream)) {
- boolean successful = displayHDFFile(docFiles[i].toString(), false, false);
- if (!successful) {
- errorCount++;
- }
- } else {
- System.out.println("File " + docFiles[i] + " is an RTF file, skipping...");
- }
- }
- if (errorCount != 0) {
- double errorRateDouble = (100.0 * errorCount) / docFiles.length;
- int errorRate = new Double(errorRateDouble).intValue();
- System.out.println("Error rate: " + errorCount + "/"+docFiles.length+ " files ("+ errorRate+"%) ");
- }
- } else {
- System.out.println(directoryName +
- " is not a valid directory. Aborting.");
- errorCount++;
- }
- return errorCount;
- }
-
- private boolean isFileRTF(InputStream fileStream) {
-
- boolean foundMarker = false;
-
- fileStream.mark(100);
-
- try {
- byte[] magicBuf = new byte[5];
-
- fileStream.read(magicBuf);
- String readStr = new String(magicBuf, "iso-8859-1");
- if (readStr.equals(rtfMagicCode)) {
- foundMarker = true;
- }
-
- fileStream.reset();
-
- } catch (FileNotFoundException fnfe) {
- fnfe.printStackTrace();
- } catch (IOException ioe) {
- ioe.printStackTrace();
- }
- return foundMarker;
- }
-
- public static void main(String[] args) {
- HDFExtractor extractor = new HDFExtractor();
- int errors = 0;
- if (args.length == 0) {
- errors = extractor.processHDFFilesInDir(System.getProperty("user.dir"));
- } else {
- errors = extractor.processHDFFilesInDir(args[0]);
- }
- }
}
Index: PDFExtractor.java
===================================================================
RCS file: /home/cvs/repository/jahia/src/java/org/jahia/utils/fileparsers/Attic/PDFExtractor.java,v
retrieving revision 1.3.2.9
retrieving revision 1.3.2.10
diff -u -r1.3.2.9 -r1.3.2.10
--- PDFExtractor.java 10 Dec 2004 15:01:38 -0000 1.3.2.9
+++ PDFExtractor.java 8 Apr 2005 14:38:58 -0000 1.3.2.10
@@ -108,7 +108,12 @@
}
//create a tmp output stream with the size of the content.
ByteArrayOutputStream out = new ByteArrayOutputStream();
- OutputStreamWriter writer = new OutputStreamWriter(out);
+ OutputStreamWriter writer = null;
+ if ( charSet != null ) {
+ writer = new OutputStreamWriter(out,charSet);
+ } else {
+ writer = new OutputStreamWriter(out);
+ }
PDFTextStripper stripper = new PDFTextStripper();
stripper.writeText(pdfDocument, writer);
writer.close();