fileparsers HDFExtractor.java PDFExtractor.java

Khue Nguyen Fri, 08 Apr 2005 07:39:01 -0700

knguyen     2005/04/08 16:38:59 CEST


  Modified files:        (Branch: JAHIA-4-0-BRANCH)
    src/java/org/jahia/utils/fileparsers HDFExtractor.java 
                                         PDFExtractor.java 
  Log:
  - fix pdf and charset parsing
  
  Revision  Changes    Path
  1.4.4.3   +14 -292   jahia/src/java/org/jahia/utils/fileparsers/HDFExtractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/utils/fileparsers/HDFExtractor.java.diff?r1=1.4.4.2&r2=1.4.4.3&f=h
  1.3.2.10  +6 -1      jahia/src/java/org/jahia/utils/fileparsers/PDFExtractor.java
http://jahia.mine.nu:8080/cgi-bin/cvsweb.cgi/jahia/src/java/org/jahia/utils/fileparsers/PDFExtractor.java.diff?r1=1.3.2.9&r2=1.3.2.10&f=h
  
  
  
  Index: HDFExtractor.java
  ===================================================================
  RCS file: /home/cvs/repository/jahia/src/java/org/jahia/utils/fileparsers/Attic/HDFExtractor.java,v
  retrieving revision 1.4.4.2
  retrieving revision 1.4.4.3
  diff -u -r1.4.4.2 -r1.4.4.3
  --- HDFExtractor.java 20 Oct 2004 09:46:09 -0000      1.4.4.2
  +++ HDFExtractor.java 8 Apr 2005 14:38:58 -0000       1.4.4.3
  @@ -1,22 +1,11 @@
   package org.jahia.utils.fileparsers;
   
  -import java.io.File;
  -import java.io.FileInputStream;
  -import java.io.FileNotFoundException;
  -import java.io.FileWriter;
  -import java.io.FilenameFilter;
  -import java.io.IOException;
  -import java.io.InputStream;
  -import java.io.StringWriter;
  +import java.io.*;
   import java.util.Iterator;
   
  -import org.apache.poi.hdf.extractor.WordDocument;
  -import org.apache.poi.poifs.filesystem.DirectoryEntry;
  -import org.apache.poi.poifs.filesystem.DocumentEntry;
  -import org.apache.poi.poifs.filesystem.DocumentInputStream;
  -import org.apache.poi.poifs.filesystem.Entry;
  -import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  -import java.io.BufferedInputStream;
  +import org.apache.slide.extractor.MSWordExtractor;
  +import org.jahia.utils.FileUtils;
  +
   
   /**
    * <p>Title: HDF Text content extractor file</p>
  @@ -31,8 +20,9 @@
    */
   
   public class HDFExtractor implements FileExtractor {
  -    private static final String rtfMagicCode = "{\\rtf";
  -    private static final int BUF_LEN = 100;
  +
  +    private static org.apache.log4j.Logger logger =
  +        org.apache.log4j.Logger.getLogger (HDFExtractor.class);
   
       public HDFExtractor() {
       }
  @@ -48,12 +38,7 @@
        */
       public String getContentAsString(String path, long lastModified, 
InputStream fileStream)
       throws IOException {
  -        BufferedInputStream bufStream = new BufferedInputStream(fileStream, 
BUF_LEN);
  -        if (isFileRTF(bufStream)) {
  -            throw new IOException("File is in RTF format. Cannot process.");
  -        } else {
  -            return getHDFText(bufStream, false, false);
  -        }
  +        return  getContentAsString(path, lastModified, fileStream, null);
       }
   
       /**
  @@ -69,280 +54,17 @@
       public String getContentAsString(String path, long lastModified,
                                        InputStream fileStream, String charSet)
       throws IOException {
  -      return getContentAsString(path, lastModified, fileStream);
  -
  -    }
  -
  -    private String getHDFText(InputStream fileStream,
  -                                        boolean displayUnknownChars,
  -                                        boolean displayControlChars) throws
  -        IOException {
  -        WordDocument wordDocument = new WordDocument(fileStream);
  -        StringWriter sw = new StringWriter();
  -        wordDocument.writeAllText(sw);
  -        String result = sw.toString();
  -        StringWriter writer = new StringWriter();
  -        int lineLengthCounter = 0;
  -        for (int i = 0; i < result.length(); i++) {
  -            char ch = result.charAt(i);
  -            if (ch >= 32) {
  -                writer.write(ch);
  -                lineLengthCounter++;
  -            } else {
  -                switch (ch) {
  -                    case 9:
  -                        writer.write(ch);
  -                        lineLengthCounter += 3;
  -                        break;
  -                    case 11:
  -                        if (displayControlChars) {
  -                            writer.write("<br>");
  -                            lineLengthCounter += "<br>".length();
  -                        } else {
  -                            writer.write(System.getProperty(
  -                                "line.separator"));
  -                            lineLengthCounter = 0;
  -                        }
  -                        break;
  -                    case 13:
  -                        writer.write(System.getProperty("line.separator"));
  -                        lineLengthCounter = 0;
  -                        break;
  -                    case 14:
  -                        if (displayControlChars) {
  -                            writer.write("COLUMNBREAK");
  -                            lineLengthCounter += "COLUMNBREAK".length();
  -                        } else {
  -                            writer.write(" ");
  -                            lineLengthCounter++;
  -                        }
  -                        break;
  -                    case 19:
  -                        if (displayControlChars) {
  -                            writer.write("FIELDBEGIN");
  -                            lineLengthCounter += "FIELDBEGIN".length();
  -                        } else {
  -                            writer.write(" ");
  -                            lineLengthCounter++;
  -                        }
  -                        break;
  -                    case 20:
  -                        if (displayControlChars) {
  -                            writer.write("FIELDSEP");
  -                            lineLengthCounter += "FIELDSEP".length();
  -                        } else {
  -                            writer.write(" ");
  -                            lineLengthCounter++;
  -                        }
  -                        break;
  -                    case 21:
  -                        if (displayControlChars) {
  -                            writer.write("FIELDEND");
  -                            lineLengthCounter += "FIELDEND".length();
  -                        } else {
  -                            writer.write(" ");
  -                            lineLengthCounter++;
  -                        }
  -                        break;
  -                    case 160:
  -                        if (displayControlChars) {
  -                            writer.write("&nbsp;");
  -                            lineLengthCounter += "&nbsp;".length();
  -                        } else {
  -                            writer.write(" ");
  -                            lineLengthCounter++;
  -                        }
  -                        break;
  -                    default:
  -                        if (displayUnknownChars) {
  -                            writer.write("0x" + Integer.toHexString(ch));
  -                            lineLengthCounter +=
  -                                ( (String) "0x" + Integer.toHexString(ch)).
  -                                length();
  -                        }
  -                }
  -            }
  -            /*
  -                             if (lineLengthCounter > 75) {
  -                writer.write(System.getProperty("line.separator"));
  -                lineLengthCounter = 0;
  -                             }
  -             */
  -        }
  -        return writer.toString();
  -    }
  -
  -    private boolean displayHDFFile(String fileName, boolean 
displayUnknownChars,
  -                                 boolean displayControlChars) {
  -        // displayFileSystem(fileName);
  -        boolean successful = true;
  +        MSWordExtractor ex = new MSWordExtractor("","","");
           try {
  -            FileInputStream fileStream = new FileInputStream(fileName);
  -            String outputStr = getHDFText(fileStream, displayUnknownChars, 
displayControlChars);
  -            if (outputStr.length() > 0) {
  -                FileWriter fileWriter = new FileWriter(fileName + ".txt");
  -                fileWriter.write(outputStr);
  -                fileWriter.flush();
  -                fileWriter.close();
  -            } else {
  -                System.out.println("Problem during text extraction, content 
for file "+fileName+ " is empty.");
  -                successful = false;
  -            }
  -        } catch (IOException ioe) {
  -            System.out.println("Error in file " + fileName + " : ");
  -            ioe.printStackTrace(System.out);
  -            successful = false;
  +            Reader reader = ex.extract(fileStream);
  +            return FileUtils.readerToString(reader);
  +        } catch ( Throwable t ){
  +            logger.debug("Exception extraction file " + path, t);
  +            throw new IOException(t.getMessage());
           }
  -        return successful;
  -    }
   
  -    private void displayFileSystem(String fileName) throws 
NullPointerException {
  -        try {
  -            InputStream inputStream = new FileInputStream(fileName);
  -            POIFSFileSystem fs;
  -            fs = new POIFSFileSystem(inputStream);
  -            DirectoryEntry root = fs.getRoot();
  -
  -            // dir is an instance of DirectoryEntry ...
  -            for (Iterator iter = root.getEntries(); iter.hasNext(); ) {
  -                Entry entry = (Entry) iter.next();
  -                System.out.print("found entry: " + entry.getName());
  -                if (entry instanceof DirectoryEntry) {
  -                    // .. recurse into this directory
  -                    System.out.println(",type=directory");
  -                } else if (entry instanceof DocumentEntry) {
  -                    // entry is a document, which you can read
  -                    DocumentEntry docEntry = (DocumentEntry) entry;
  -                    System.out.println(",type=document,size=" +
  -                                       docEntry.getSize() + " bytes");
  -                    if (entry.getName().equalsIgnoreCase("WordDocument")) {
  -                        DocumentInputStream docStream = new 
DocumentInputStream(
  -                            docEntry);
  -                        byte[] content = new byte[docStream.available()];
  -                        docStream.read(content);
  -                        docStream.close();
  -                        int lineLengthCounter = 0;
  -                        char previousChar = 0;
  -                        for (int i = 0; i < content.length; i++) {
  -                            if (content[i] >= 32 && content[i] < 127) {
  -                                System.out.print( (char) content[i]);
  -                            } else {
  -                                int byteInt = content[i] & 0xFF;
  -                                System.out.print("0x" +
  -                                                 
Integer.toHexString(byteInt));
  -                                lineLengthCounter++;
  -                                lineLengthCounter++;
  -                                lineLengthCounter++;
  -                            }
  -                            lineLengthCounter++;
  -                            if (lineLengthCounter > 75) {
  -                                System.out.println("");
  -                                lineLengthCounter = 0;
  -                            }
  -                            previousChar = (char) content[i];
  -                        }
  -                        System.out.println("");
  -                    }
  -                } else {
  -                    System.out.println(",type=other");
  -                    // currently, either an Entry is a DirectoryEntry or a 
DocumentEntry,
  -                    // but in the future, there may be other entry 
subinterfaces. The
  -                    // internal data structure certainly allows for a lot 
more entry types.
  -                }
  -            }
  -
  -        } catch (FileNotFoundException fnfe) {
  -            fnfe.printStackTrace();
  -        } catch (IOException e) {
  -            // an I/O error occurred, or the InputStream did not provide a 
compatible
  -            // POIFS data structure
  -            e.printStackTrace();
  -        }
       }
   
  -    class DocFileFilter
  -        implements FilenameFilter {
  -        public boolean accept(File dir,
  -                              String name) {
  -            if (name.toLowerCase().endsWith(".doc")) {
  -                return true;
  -            } else {
  -                return false;
  -            }
  -        }
  -    }
   
  -    private int processHDFFilesInDir(String directoryName) {
  -        int errorCount = 0;
  -        File currentDir = new File(directoryName);
  -        if (currentDir.isDirectory()) {
  -            DocFileFilter docFilter = new DocFileFilter();
  -            File[] docFiles = currentDir.listFiles(docFilter);
  -            System.out.println("Processing "+docFiles.length+" *.doc files 
in " +
  -                               currentDir.toString() + "...");
  -            for (int i = 0; i < docFiles.length; i++) {
  -                // System.out.println("Dumping text of " + 
docFiles[i].toString());
  -                FileInputStream fileStream = null;
  -                try {
  -                    fileStream = new FileInputStream(docFiles[i]);
  -                } catch (FileNotFoundException e) {
  -                    continue;
  -                }
  -                if (!isFileRTF(fileStream)) {
  -                    boolean successful = 
displayHDFFile(docFiles[i].toString(), false, false);
  -                    if (!successful) {
  -                        errorCount++;
  -                    }
  -                } else {
  -                    System.out.println("File " + docFiles[i] + " is an RTF 
file, skipping...");
  -                }
  -            }
  -            if (errorCount != 0) {
  -                double errorRateDouble = (100.0 * errorCount) / 
docFiles.length;
  -                int errorRate = new Double(errorRateDouble).intValue();
  -                System.out.println("Error rate: " + errorCount + 
"/"+docFiles.length+ " files ("+ errorRate+"%) ");
  -            }
  -        } else {
  -            System.out.println(directoryName +
  -                               " is not a valid directory. Aborting.");
  -            errorCount++;
  -        }
  -        return errorCount;
  -    }
  -
  -    private boolean isFileRTF(InputStream fileStream) {
  -
  -        boolean foundMarker = false;
  -
  -        fileStream.mark(100);
  -
  -        try {
  -            byte[] magicBuf = new byte[5];
  -
  -            fileStream.read(magicBuf);
  -            String readStr = new String(magicBuf, "iso-8859-1");
  -            if (readStr.equals(rtfMagicCode)) {
  -                foundMarker = true;
  -            }
  -
  -            fileStream.reset();
  -
  -        } catch (FileNotFoundException fnfe) {
  -            fnfe.printStackTrace();
  -        } catch (IOException ioe) {
  -            ioe.printStackTrace();
  -        }
  -        return foundMarker;
  -    }
  -
  -    public static void main(String[] args) {
  -        HDFExtractor extractor = new HDFExtractor();
  -        int errors = 0;
  -        if (args.length == 0) {
  -            errors = 
extractor.processHDFFilesInDir(System.getProperty("user.dir"));
  -        } else {
  -            errors = extractor.processHDFFilesInDir(args[0]);
  -        }
  -    }
   
   }
  
  
  
  Index: PDFExtractor.java
  ===================================================================
  RCS file: /home/cvs/repository/jahia/src/java/org/jahia/utils/fileparsers/Attic/PDFExtractor.java,v
  retrieving revision 1.3.2.9
  retrieving revision 1.3.2.10
  diff -u -r1.3.2.9 -r1.3.2.10
  --- PDFExtractor.java 10 Dec 2004 15:01:38 -0000      1.3.2.9
  +++ PDFExtractor.java 8 Apr 2005 14:38:58 -0000       1.3.2.10
  @@ -108,7 +108,12 @@
               }
               //create a tmp output stream with the size of the content.
               ByteArrayOutputStream out = new ByteArrayOutputStream();
  -            OutputStreamWriter writer = new OutputStreamWriter(out);
  +            OutputStreamWriter writer = null;
  +            if ( charSet != null ) {
  +                writer = new OutputStreamWriter(out,charSet);
  +            } else {
  +                writer = new OutputStreamWriter(out);
  +            }
               PDFTextStripper stripper = new PDFTextStripper();
               stripper.writeText(pdfDocument, writer);
               writer.close();

cvs commit: jahia/src/java/org/jahia/utils/fileparsers HDFExtractor.java PDFExtractor.java

Reply via email to