Index MSOffice Documents

Sergiu Gordea Fri, 25 Jun 2004 05:42:18 -0700

Hi all,

I'm working on a project in which we are building a knowledge management platform. We are using Turbine/Velocity as the framework and Lucene for search.

We want the search to be able to index MS Office documents, so I was looking for ways to extract the text from these documents. I found some examples based on the POI library (http://jakarta.apache.org/poi) and adapted them to our needs. I think the extraction of text from XLS files is reliable (the POI development community did a great job with the package that works with XLS files). The examples that extract the text from DOC and PPT files are not very general; I think they have problems with documents written in special charsets, but they work well on the documents I use. I hope someone with more experience than I have will improve this and contribute better source code.

Congratulations to all the people involved in the development of the Jakarta project and its subprojects,

Sergiu Gordea

PS: ExeConverterImpl uses an external stand-alone application (like antiword or pdf2txt) to extract the text.

/* @(#) CWK 1.4 07.06.2004
 * 
 * Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
 * Universitätsstr. 94/7 9020 Klagenfurt Austria
 * www.configworks.com
 * All rights reserved.
 */


package com.configworks.cwk.be.search.converters;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;

/**
 * Class description
 *
 * @author sergiu
 * @version 1.0
 * @since CWK 1.5
 */
public class XLSConverterImpl extends JavaDocumentConverter {

    private Log logger = null;
    File dest = null;

    

    public boolean extractText(InputStream reader, BufferedWriter writer) throws 
FileNotFoundException,
        IOException {

        HSSFWorkbook workbook = new HSSFWorkbook(reader);
        
        for (int k = 0; k < workbook.getNumberOfSheets(); k++) {
            HSSFSheet sheet = workbook.getSheetAt(k);
            
            if (sheet != null) {
                int rows = sheet.getLastRowNum();
                //I don't know why the last row = sheet.getRow(rows) and first row = 
sheet.getRow(0) 
                for (int r = 0; r <= rows; r++) {
                        HSSFRow row = sheet.getRow(r);
                        if (row != null) {
                            int cells = row.getLastCellNum();
                            for (int c = 0; c <= cells; c++) {
                                    HSSFCell cell = row.getCell((short) c);
                                    String value = null;
                                    if (cell != null) {
                                        switch (cell.getCellType()) {
                                            case HSSFCell.CELL_TYPE_FORMULA:
                                                value = cell.getCellFormula();
                                                break;
                                            case HSSFCell.CELL_TYPE_STRING:
                                                value = cell.getStringCellValue();
                                                break;
                                            case HSSFCell.CELL_TYPE_NUMERIC:
                                                value = "" + 
cell.getNumericCellValue();
                                                break;
                                            default:
                                                value = cell.getStringCellValue();
                                        }
                                    }
                                    if (value != null) {
                                        writer.write(value + " ");
                                    }
                                }//cels
                        }
                    }//rows
            }
        }//sheets
        
        //if no Exception was thrown consider that the conversion was successful 
        return true;
    }

    /**
     * @return Returns the logger.
     */
    public Log getLogger() {
        if (logger == null)
            logger = LogFactory.getLog(XLSConverterImpl.class);
        return logger;
    }

}

package com.configworks.cwk.be.search.converters;

import com.configworks.cwk.share.Utils;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;


/**
 * Created by IntelliJ IDEA.
 * User: Kostya
 * Date: 12.09.2003
 * Time: 11:39:25
 * To change this template use Options | File Templates.
 */

public class ExeConverterImpl extends Converter {

    private Log logger = LogFactory.getLog(ExeConverterImpl.class.getName());

    public Reader convertSource(File source) {
        try {
            // the type is not registered the file content will not be added to the 
index
            if (_config.getExecutionPath() == null) {
                return null;
            }
            // else convert file into a temp dir and return contents of the converted 
file
            else {
                // if no converter is specified the file will be added withot 
conversion
                if (_config.getExecutionPath().length() == 0)
                    return new FileReader(source);

                String execPath = _config.getExecutionPath();

                String sourcePath = source.getAbsolutePath();
                // create tempdir if it doesn't exists
                new File(_config.getTempDirectory()).mkdirs();

                String targetPath = _config.getTempDirectory() + File.separator + 
source.getName()
                    + ".txt";
                
                String params = "";
                if(_config.getPathParam()!= null){
                        //add HOME parameter
                        params += _config.getPathParam();
                }

                Process process = Utils.executeOSCommand(execPath, sourcePath, 
targetPath, params);
                process.waitFor();
                if (logger.isTraceEnabled()) {
                    BufferedInputStream stream=null;
                    try {
                            stream = new BufferedInputStream(process.getErrorStream());
                            int read = 0;
                            String outErrorString = "";
                            while ((read = stream.read()) > 0)
                                outErrorString += ((char) read);
                            stream.close();
                            if (outErrorString.length() > 0)
                                logger.error(outErrorString);
                    } finally {
                        if (stream!=null) {
                            stream.close();
                        }
                    }
                }
                File convertedSource = new File(targetPath);
                convertedSource.deleteOnExit();
                return new FileReader(convertedSource);
            }
        } catch (IOException ex) {
            if (logger.isErrorEnabled())
                logger.error("IOException: " + ex.getMessage());
        } catch (InterruptedException ex) {
            if (logger.isErrorEnabled())
                logger.error("InterruptedException: " + ex.getMessage());
        }
            
        return null;
    }
}

/* @(#) CWK 1.4 25.06.2004
 * 
 * Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
 * Universitätsstr. 94/7 9020 Klagenfurt Austria
 * www.configworks.com
 * All rights reserved.
 */

package com.configworks.cwk.be.search.converters;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

/**
 * Class description 
 *  
 * @author sergiu
 * @version 1.0
 * 
 * @since CWK 1.5
 */
public abstract class JavaDocumentConverter extends Converter {

        File dest = null;
        
        /* (non-Javadoc)
         * @see 
com.configworks.cwk.be.search.converters.Converter#convertSource(java.io.File)
         */
        public Reader convertSource(File source) {
                if (source == null)
                        return null;
                Reader reader = null;
                InputStream inputStream = null;
                BufferedWriter writer = null;
                try {
                        String filename = source.getName();
                        filename = filename.replace('.', '_');
                        filename += ".txt";
                        File tmpDir = new File(_config.getTempDirectory());
                        tmpDir.mkdirs();
                        dest = new File(tmpDir.getPath(), filename);
                        boolean created = dest.createNewFile();            
                        
                        //create the input and output streams 
                        writer = new BufferedWriter( 
                                        new FileWriter(dest));
                        inputStream = new FileInputStream(source);
                        
                        extractText(inputStream, writer);
                        
                        if (!dest.exists())
                                return null;
                        dest.deleteOnExit();
                        reader = new BufferedReader(new FileReader(dest));

                } catch (Exception e) {
                        getLogger().error("JavaDocumentConverter cannot convert the 
source file: "
                                        + source.getAbsolutePath(), e);
                        reader = null;
                }finally{
                        try {
                                if(writer != null)
                                        writer.close();
                                if(inputStream != null)
                                        inputStream.close();
                        } catch (IOException ex) {
                                if(getLogger().isDebugEnabled())
                                        getLogger().error("Cannot close the stream: " 
+ ex);
                        }
                }
                return reader;

        }

        /**
         * @param inputStream
         * @param writer
         * @since CWK 1.4.1
         * @see 
         */
        public abstract boolean extractText(InputStream inputStream, BufferedWriter 
writer) throws IOException; 

}

/* @(#) CWK 1.5 23.06.2004
 * 
 * Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
 * Universitätsstr. 94/7 9020 Klagenfurt Austria
 * www.configworks.com
 * All rights reserved.
 */

package com.configworks.cwk.be.search.converters;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.util.LittleEndian;

/**
 * Class description
 *
 * @author sergiu
 * @version 1.0
 * @since CWK 1.5
 */
public class PPTConverterImpl extends JavaDocumentConverter {

    static final String lineSeparator = System.getProperty("line.separator");
   
    /**
     * Extract the text from a number of presentations.
     */
    public boolean extractText(InputStream  reader, BufferedWriter writer) throws 
IOException{
                POIFSReader r = new POIFSReader();

                /* Register a listener for *all* documents. */
                MyPOIFSReaderListener listener = new MyPOIFSReaderListener(writer);
                r.registerListener(listener);
                r.read(reader);
                //if no exception was trown, consider that the conversion was 
successful  
                return true;
    }   
    
    class MyPOIFSReaderListener implements POIFSReaderListener{
        private BufferedWriter writer = null;
        
        public MyPOIFSReaderListener(BufferedWriter writer){
                this.writer = writer;
        }

        public void processPOIFSReaderEvent(POIFSReaderEvent event) {
                PropertySet ps = null;

                try{
                        
                        org.apache.poi.poifs.filesystem.DocumentInputStream dis=null;
                        if(!event.getName().equalsIgnoreCase("PowerPoint Document"))
                                return;
                        
                        dis=event.getStream();
                        
                        byte btoWrite[]= new byte[12];
                        dis.read(btoWrite);
                        
                        btoWrite = new byte[dis.available()];
                        dis.read(btoWrite, 0, dis.available());
                        
                        //StringBuffer buff = new StringBuffer("");
                        
                        for(int i=0; i<btoWrite.length-20; i++){

                                long type=LittleEndian.getUShort(btoWrite,i+2);
                                long size=LittleEndian.getUInt(btoWrite,i+4);
                                if (type==4008){
                                        
                                        int offset = i+4+1;
                                        int length = (int)size+3;
                                        int end = offset + length;
                                        
                                        byte[] textBytes = new byte[length]; 
                                        
                                        for (int j = offset; j < end; j++) {
                                                byte b = btoWrite[j];
                                                writer.write((char) b);
                                        }

                                        if(i < (end -1))
                                                i = end -1;
                                }
                                
                        }
                        
                        PropertySetFactory.create(event.getStream());
                }catch (Exception e){
                        String msg = "Cannot index ppt file: \n";
                    if(getLogger().isErrorEnabled())
                        getLogger().error(msg + e);
                }       
        }       
    }
}

/* @(#) CWK 1.4 24.06.2004
 * 
 * Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
 * Universitätsstr. 94/7 9020 Klagenfurt Austria
 * www.configworks.com
 * All rights reserved.
 */

package com.configworks.cwk.be.search.converters;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;

/**
 * Extracts text from a Word (DOC) file by reading the piece table from the
 * table stream and decoding each text piece from the "WordDocument" stream.
 *
 * @author sergiu
 * @version 1.0
 * @since CWK 1.5
 */
public class WordConverterImpl extends JavaDocumentConverter {

    /**
     * Writes every text piece of the document to <code>writer</code>, each
     * followed by a single space.
     *
     * @param in stream positioned at the start of the DOC file
     * @param writer destination of the extracted text
     * @return always <code>true</code>; failures are signalled by exceptions
     * @throws IOException if the file cannot be read, the piece table is not
     *         found where expected, or the text cannot be written
     */
    public boolean extractText(InputStream in, BufferedWriter writer)
            throws IOException {
        POIFSFileSystem fsys = new POIFSFileSystem(in);

        byte[] header = readStream(fsys, "WordDocument");

        // Get the flags from the document header
        int info = LittleEndian.getShort(header, 0xa);
        boolean useTable1 = (info & 0x200) != 0;

        // Get the offset of the piece table information
        int complexOffset = LittleEndian.getInt(header, 0x1a2);

        String tableName = useTable1 ? "1Table" : "0Table";
        byte[] tableStream = readStream(fsys, tableName);

        ArrayList text = new ArrayList();
        findText(tableStream, complexOffset, text);

        int size = text.size();
        for (int x = 0; x < size; x++) {
            WordTextPiece nextPiece = (WordTextPiece) text.get(x);
            int start = nextPiece.getStart();
            int length = nextPiece.getLength();

            String toStr;
            if (nextPiece.usesUnicode()) {
                // Two bytes per character for EVERY Unicode piece. The
                // original used one document-wide multiplier that dropped to
                // 1 as soon as any compressed piece was present.
                toStr = new String(header, start, length * 2, "UTF-16LE");
            } else {
                // Compressed pieces hold one ANSI byte per character; Cp1252
                // also maps the 0x80-0x9F range that ISO-8859-1 turns into
                // control characters.
                toStr = new String(header, start, length, "Cp1252");
            }

            writer.write(toStr + " ");
        }
        // if no exception occurred the conversion is considered successful
        return true;
    }

    /** Reads the complete content of the named stream in the file's root directory. */
    private static byte[] readStream(POIFSFileSystem fsys, String name)
            throws IOException {
        DocumentEntry entry = (DocumentEntry) fsys.getRoot().getEntry(name);
        byte[] content = new byte[entry.getSize()];
        DocumentInputStream din = fsys.createDocumentInputStream(name);
        try {
            din.read(content);
        } finally {
            din.close();
        }
        return content;
    }

    /**
     * Parses the piece table starting at <code>complexOffset</code> in
     * <code>tableStream</code> and appends one {@link WordTextPiece} per
     * piece to <code>text</code>.
     *
     * @throws IOException if no piece table marker is found where expected
     */
    private static void findText(byte[] tableStream, int complexOffset,
                                 ArrayList text) throws IOException {
        int pos = complexOffset;
        // skips through the prms before we reach the piece table. These
        // contain data for actual fast saved files
        while (tableStream[pos] == 1) {
            pos++;
            int skip = LittleEndian.getShort(tableStream, pos);
            pos += 2 + skip;
        }

        if (tableStream[pos] != 2) {
            throw new IOException("corrupted Word file");
        }

        // parse out the text pieces
        int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
        pos += 4;
        // (pieces + 1) 4-byte character positions followed by one 8-byte
        // descriptor per piece
        int pieces = (pieceTableSize - 4) / 12;
        for (int x = 0; x < pieces; x++) {
            int descriptor = pos + ((pieces + 1) * 4) + (x * 8);
            int filePos = LittleEndian.getInt(tableStream, descriptor + 2);

            boolean unicode = (filePos & 0x40000000) == 0;
            if (!unicode) {
                filePos &= ~(0x40000000); // gives me FC in doc stream
                filePos /= 2;
            }

            int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4)
                    - LittleEndian.getInt(tableStream, pos + (x * 4));

            text.add(new WordTextPiece(filePos, totLength, unicode));
        }
    }

}

/* @(#) CWK 1.4 07.06.2004
 * 
 * Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
 * Universitätsstr. 94/7 9020 Klagenfurt Austria
 * www.configworks.com
 * All rights reserved.
 */

package com.configworks.cwk.be.search.converters;

/**
 * Value holder describing one piece of text of a Word document: where it
 * starts, how long it is and whether it is flagged as Unicode.
 *
 * @author sergiu
 * @version 1.0
 *
 * @since CWK 1.4
 */
class WordTextPiece {

    /** Start position handed to the constructor. */
    private final int start;

    /** Length handed to the constructor. */
    private final int length;

    /** Unicode flag handed to the constructor. */
    private final boolean unicode;

    /**
     * @param start start position of the piece
     * @param length length of the piece
     * @param unicode whether the piece is flagged as Unicode
     */
    public WordTextPiece(int start, int length, boolean unicode) {
        this.start = start;
        this.length = length;
        this.unicode = unicode;
    }

    /** @return the start position given at construction time */
    public int getStart() {
        return this.start;
    }

    /** @return the length given at construction time */
    public int getLength() {
        return this.length;
    }

    /** @return the Unicode flag given at construction time */
    public boolean usesUnicode() {
        return this.unicode;
    }

}

package com.configworks.cwk.be.search.converters;

import java.io.File;
import java.io.Reader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Base class of all document converters: holds the converter configuration
 * and provides a logger.
 *
 * Created: 11.09.2003 19:24:56
 *
 * @author Kostya
 */

public abstract class Converter {
    /** Configuration set through {@link #Initialize(ConverterConfig)}. */
    protected ConverterConfig _config;

    /** Lazily created by {@link #getLogger()}. */
    private Log logger = null;

    /**
     * Converts <code>source</code> and returns a reader over the result.
     *
     * @param source the file to convert
     * @return a reader over the converted content
     */
    public abstract Reader convertSource(File source);

    /**
     * Stores the configuration used by this converter.
     *
     * @param config the converter configuration
     */
    protected void Initialize(ConverterConfig config) {
        _config = config;
    }

    /**
     * Returns a logger named after the concrete converter class. The
     * original created it for <code>XLSConverterImpl</code>, so every
     * converter logged under that name.
     *
     * @return Returns the logger.
     */
    public Log getLogger() {
        if (logger == null) {
            logger = LogFactory.getLog(getClass());
        }
        return logger;
    }
}

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Index MSOffice Documents

Reply via email to