Hi all,
I'm working on a project in which we are building a knowledge management platform. We are using Turbine/Velocity
as the framework and Lucene for search.
We want the search to be able to index MS Office documents, so I looked for ways to extract the text from these
documents. I found some examples based on the POI library (http://jakarta.apache.org/poi) and adapted them to our needs.
The extraction of the text elements from XLS files is, I think, trustworthy (the POI development community did a great job with the package that
works with XLS files). The examples that extract the text from DOC and PPT files are not very general; I think they have problems with documents
written with special charsets, but they work well on the documents I use. I hope someone with more experience than I have will improve this
and will post better source code.
Congratulations to all the people involved in the development of the Jakarta project and its subprojects,
Sergiu Gordea
PS: ExeConverterImpl uses an external stand-alone application (like antiword or pdf2txt) to extract the text.
/* @(#) CWK 1.4 07.06.2004 * * Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH * Universitätsstr. 94/7 9020 Klagenfurt Austria * www.configworks.com * All rights reserved. */
package com.configworks.cwk.be.search.converters;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
/**
* Class description
*
* @author sergiu
* @version 1.0
* @since CWK 1.5
*/
public class XLSConverterImpl extends JavaDocumentConverter {
private Log logger = null;
File dest = null;
public boolean extractText(InputStream reader, BufferedWriter writer) throws
FileNotFoundException,
IOException {
HSSFWorkbook workbook = new HSSFWorkbook(reader);
for (int k = 0; k < workbook.getNumberOfSheets(); k++) {
HSSFSheet sheet = workbook.getSheetAt(k);
if (sheet != null) {
int rows = sheet.getLastRowNum();
//I don't know why the last row = sheet.getRow(rows) and first row =
sheet.getRow(0)
for (int r = 0; r <= rows; r++) {
HSSFRow row = sheet.getRow(r);
if (row != null) {
int cells = row.getLastCellNum();
for (int c = 0; c <= cells; c++) {
HSSFCell cell = row.getCell((short) c);
String value = null;
if (cell != null) {
switch (cell.getCellType()) {
case HSSFCell.CELL_TYPE_FORMULA:
value = cell.getCellFormula();
break;
case HSSFCell.CELL_TYPE_STRING:
value = cell.getStringCellValue();
break;
case HSSFCell.CELL_TYPE_NUMERIC:
value = "" +
cell.getNumericCellValue();
break;
default:
value = cell.getStringCellValue();
}
}
if (value != null) {
writer.write(value + " ");
}
}//cels
}
}//rows
}
}//sheets
//if no Exception was thrown consider that the conversion was successful
return true;
}
/**
* @return Returns the logger.
*/
public Log getLogger() {
if (logger == null)
logger = LogFactory.getLog(XLSConverterImpl.class);
return logger;
}
}
package com.configworks.cwk.be.search.converters;
import com.configworks.cwk.share.Utils;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* Created by IntelliJ IDEA.
* User: Kostya
* Date: 12.09.2003
* Time: 11:39:25
* To change this template use Options | File Templates.
*/
public class ExeConverterImpl extends Converter {
private Log logger = LogFactory.getLog(ExeConverterImpl.class.getName());
public Reader convertSource(File source) {
try {
// the type is not registered the file content will not be added to the
index
if (_config.getExecutionPath() == null) {
return null;
}
// else convert file into a temp dir and return contents of the converted
file
else {
// if no converter is specified the file will be added withot
conversion
if (_config.getExecutionPath().length() == 0)
return new FileReader(source);
String execPath = _config.getExecutionPath();
String sourcePath = source.getAbsolutePath();
// create tempdir if it doesn't exists
new File(_config.getTempDirectory()).mkdirs();
String targetPath = _config.getTempDirectory() + File.separator +
source.getName()
+ ".txt";
String params = "";
if(_config.getPathParam()!= null){
//add HOME parameter
params += _config.getPathParam();
}
Process process = Utils.executeOSCommand(execPath, sourcePath,
targetPath, params);
process.waitFor();
if (logger.isTraceEnabled()) {
BufferedInputStream stream=null;
try {
stream = new BufferedInputStream(process.getErrorStream());
int read = 0;
String outErrorString = "";
while ((read = stream.read()) > 0)
outErrorString += ((char) read);
stream.close();
if (outErrorString.length() > 0)
logger.error(outErrorString);
} finally {
if (stream!=null) {
stream.close();
}
}
}
File convertedSource = new File(targetPath);
convertedSource.deleteOnExit();
return new FileReader(convertedSource);
}
} catch (IOException ex) {
if (logger.isErrorEnabled())
logger.error("IOException: " + ex.getMessage());
} catch (InterruptedException ex) {
if (logger.isErrorEnabled())
logger.error("InterruptedException: " + ex.getMessage());
}
return null;
}
}
/* @(#) CWK 1.4 25.06.2004
*
* Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
* Universitätsstr. 94/7 9020 Klagenfurt Austria
* www.configworks.com
* All rights reserved.
*/
package com.configworks.cwk.be.search.converters;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
/**
* Class description
*
* @author sergiu
* @version 1.0
*
* @since CWK 1.5
*/
public abstract class JavaDocumentConverter extends Converter {
File dest = null;
/* (non-Javadoc)
* @see
com.configworks.cwk.be.search.converters.Converter#convertSource(java.io.File)
*/
public Reader convertSource(File source) {
if (source == null)
return null;
Reader reader = null;
InputStream inputStream = null;
BufferedWriter writer = null;
try {
String filename = source.getName();
filename = filename.replace('.', '_');
filename += ".txt";
File tmpDir = new File(_config.getTempDirectory());
tmpDir.mkdirs();
dest = new File(tmpDir.getPath(), filename);
boolean created = dest.createNewFile();
//create the input and output streams
writer = new BufferedWriter(
new FileWriter(dest));
inputStream = new FileInputStream(source);
extractText(inputStream, writer);
if (!dest.exists())
return null;
dest.deleteOnExit();
reader = new BufferedReader(new FileReader(dest));
} catch (Exception e) {
getLogger().error("JavaDocumentConverter cannot convert the
source file: "
+ source.getAbsolutePath(), e);
reader = null;
}finally{
try {
if(writer != null)
writer.close();
if(inputStream != null)
inputStream.close();
} catch (IOException ex) {
if(getLogger().isDebugEnabled())
getLogger().error("Cannot close the stream: "
+ ex);
}
}
return reader;
}
/**
* @param inputStream
* @param writer
* @since CWK 1.4.1
* @see
*/
public abstract boolean extractText(InputStream inputStream, BufferedWriter
writer) throws IOException;
}
/* @(#) CWK 1.5 23.06.2004
*
* Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
* Universitätsstr. 94/7 9020 Klagenfurt Austria
* www.configworks.com
* All rights reserved.
*/
package com.configworks.cwk.be.search.converters;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.util.LittleEndian;
/**
* Class description
*
* @author sergiu
* @version 1.0
* @since CWK 1.5
*/
public class PPTConverterImpl extends JavaDocumentConverter {
static final String lineSeparator = System.getProperty("line.separator");
/**
* Extract the text from a number of presentations.
*/
public boolean extractText(InputStream reader, BufferedWriter writer) throws
IOException{
POIFSReader r = new POIFSReader();
/* Register a listener for *all* documents. */
MyPOIFSReaderListener listener = new MyPOIFSReaderListener(writer);
r.registerListener(listener);
r.read(reader);
//if no exception was trown, consider that the conversion was
successful
return true;
}
class MyPOIFSReaderListener implements POIFSReaderListener{
private BufferedWriter writer = null;
public MyPOIFSReaderListener(BufferedWriter writer){
this.writer = writer;
}
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
PropertySet ps = null;
try{
org.apache.poi.poifs.filesystem.DocumentInputStream dis=null;
if(!event.getName().equalsIgnoreCase("PowerPoint Document"))
return;
dis=event.getStream();
byte btoWrite[]= new byte[12];
dis.read(btoWrite);
btoWrite = new byte[dis.available()];
dis.read(btoWrite, 0, dis.available());
//StringBuffer buff = new StringBuffer("");
for(int i=0; i<btoWrite.length-20; i++){
long type=LittleEndian.getUShort(btoWrite,i+2);
long size=LittleEndian.getUInt(btoWrite,i+4);
if (type==4008){
int offset = i+4+1;
int length = (int)size+3;
int end = offset + length;
byte[] textBytes = new byte[length];
for (int j = offset; j < end; j++) {
byte b = btoWrite[j];
writer.write((char) b);
}
if(i < (end -1))
i = end -1;
}
}
PropertySetFactory.create(event.getStream());
}catch (Exception e){
String msg = "Cannot index ppt file: \n";
if(getLogger().isErrorEnabled())
getLogger().error(msg + e);
}
}
}
}
/* @(#) CWK 1.4 24.06.2004
*
* Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
* Universitätsstr. 94/7 9020 Klagenfurt Austria
* www.configworks.com
* All rights reserved.
*/
package com.configworks.cwk.be.search.converters;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
/**
 * Extracts text from Word (DOC) files by reading the piece table from the
 * table stream and decoding every text piece out of the "WordDocument" stream.
 *
 * @author sergiu
 * @version 1.0
 * @since CWK 1.5
 */
public class WordConverterImpl extends JavaDocumentConverter {

    /**
     * Writes the text pieces of the Word document read from {@code in} to
     * {@code writer}, each followed by a single space.
     *
     * @param in stream positioned at the start of a DOC file
     * @param writer destination of the extracted text
     * @return always {@code true}; a failed conversion is signalled by an exception
     * @throws IOException if the document cannot be read, is corrupted, or the
     *         text cannot be written
     */
    public boolean extractText(InputStream in, BufferedWriter writer) throws
            IOException {
        ArrayList text = new ArrayList();
        POIFSFileSystem fsys = new POIFSFileSystem(in);
        byte[] header = readStream(fsys, "WordDocument");

        // Take the information from the document header
        int info = LittleEndian.getShort(header, 0xa);
        boolean useTable1 = (info & 0x200) != 0;
        // Take the information about the piece table
        int complexOffset = LittleEndian.getInt(header, 0x1a2);
        String tableName = useTable1 ? "1Table" : "0Table";
        byte[] tableStream = readStream(fsys, tableName);

        findText(tableStream, complexOffset, text);

        int size = text.size();
        for (int x = 0; x < size; x++) {
            WordTextPiece nextPiece = (WordTextPiece) text.get(x);
            int start = nextPiece.getStart();
            int length = nextPiece.getLength();
            boolean unicode = nextPiece.usesUnicode();
            // The byte count depends on the encoding of THIS piece only. A multiplier
            // shared by all pieces truncated the unicode pieces of mixed documents.
            int byteCount = unicode ? length * 2 : length;
            if (start < 0 || byteCount < 0 || byteCount > header.length - start) {
                throw new IOException("corrupted Word file");
            }
            String toStr = new String(header, start, byteCount,
                    unicode ? "UTF-16LE" : "ISO-8859-1");
            writer.write(toStr);
            writer.write(" ");
        }
        // if no exception occurred we say that the conversion was successfully realized
        return true;
    }

    /**
     * Reads a whole named stream of the POIFS container into memory.
     *
     * @param fsys the opened container
     * @param name name of the root-level stream
     * @return the stream content
     * @throws IOException if the stream does not exist or cannot be read
     */
    private static byte[] readStream(POIFSFileSystem fsys, String name) throws IOException {
        DocumentEntry entry = (DocumentEntry) fsys.getRoot().getEntry(name);
        byte[] data = new byte[entry.getSize()];
        DocumentInputStream din = fsys.createDocumentInputStream(name);
        try {
            din.read(data);
        } finally {
            din.close();
        }
        return data;
    }

    /**
     * Parses the piece table found at {@code complexOffset} of the table stream
     * and appends one {@link WordTextPiece} per piece to {@code text}.
     *
     * @param tableStream content of the "0Table"/"1Table" stream
     * @param complexOffset offset read from the document header at 0x1a2
     * @param text list receiving the {@link WordTextPiece} instances
     * @throws IOException if no piece table marker is found at the expected position
     */
    private static void findText(byte[] tableStream, int complexOffset,
            ArrayList text) throws IOException {
        int pos = complexOffset;
        // skips through the prms before we reach the piece table. These contain data
        // for actual fast saved files
        while (pos >= 0 && pos < tableStream.length && tableStream[pos] == 1) {
            pos++;
            int skip = LittleEndian.getShort(tableStream, pos);
            pos += 2 + skip;
        }
        if (pos < 0 || pos >= tableStream.length || tableStream[pos] != 2) {
            throw new IOException("corrupted Word file");
        }

        // parse out the text pieces
        int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
        pos += 4;
        int pieces = (pieceTableSize - 4) / 12;
        for (int x = 0; x < pieces; x++) {
            int filePos = LittleEndian.getInt(tableStream,
                    pos + ((pieces + 1) * 4) + (x * 8) + 2);
            // bit 0x40000000 marks a piece stored with one byte per character
            boolean unicode = (filePos & 0x40000000) == 0;
            if (!unicode) {
                filePos &= ~(0x40000000); // gives me FC in doc stream
                filePos /= 2;
            }
            int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4)
                    - LittleEndian.getInt(tableStream, pos + (x * 4));
            text.add(new WordTextPiece(filePos, totLength, unicode));
        }
    }
}
/* @(#) CWK 1.4 07.06.2004
*
* Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
* Universitätsstr. 94/7 9020 Klagenfurt Austria
* www.configworks.com
* All rights reserved.
*/
package com.configworks.cwk.be.search.converters;
/**
* Class description
*
* @author sergiu
* @version 1.0
*
* @since CWK 1.4
*/
class WordTextPiece{
private int _fcStart;
private boolean _usesUnicode;
private int _length;
public WordTextPiece(int start, int length, boolean unicode){
_usesUnicode = unicode;
_length = length;
_fcStart = start;
}
public boolean usesUnicode(){
return _usesUnicode;
}
public int getStart(){
return _fcStart;
}
public int getLength(){
return _length;
}
}
package com.configworks.cwk.be.search.converters;
import java.io.File;
import java.io.Reader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Base class of all document converters. A converter turns a source file
 * into a {@link Reader} over its text, using the settings in {@code _config}.
 *
 * Created by IntelliJ IDEA.
 * User: Kostya
 * Date: 11.09.2003
 * Time: 19:24:56
 */
public abstract class Converter {
    /** Configuration assigned through {@link #Initialize(ConverterConfig)}. */
    protected ConverterConfig _config;

    // Created eagerly: class initialisation makes this thread-safe, and the category
    // is this class rather than XLSConverterImpl (a copy-paste slip).
    private static final Log logger = LogFactory.getLog(Converter.class);

    /**
     * Converts {@code source} into a reader over its textual content.
     *
     * @param source the file to convert
     * @return a reader over the text; the implementations in this package
     *         return {@code null} when the conversion is not possible
     */
    public abstract Reader convertSource(File source);

    /**
     * Stores the configuration used by this converter.
     *
     * @param config the converter configuration
     */
    protected void Initialize(ConverterConfig config) {
        _config = config;
    }

    /**
     * @return Returns the logger.
     */
    public Log getLogger() {
        return logger;
    }
}
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]
