Hi Sergiu,
I am Natarajan from India, and I am currently working on a search engine project. I saw your article on the net (http://article.gmane.org/gmane.comp.jakarta.poi.user/4851). It is very nice and useful to me.
I want to index exe files, so please send me your "com.configworks.cwk.share.Utils" file.
Thanks in advance.
Regards Natarajan.
------------------------------------------------------------------------
Hi,
I'm glad to find that the code I submitted (I cannot claim it is mine) is useful to other programmers.
I can send you the utils class, no problem. But we are not indexing exe files. The ExeConverterImpl is an external converter that converts different
file formats to text in batch mode. For example, antiword is such a converter.
Also the ppt converter I submitted throws an OutOfMemoryError. I'll send the code with the bugfix.
I wish you luck in your work, and here are the classes:
Sergiu
/******* Util class *******/ package com.configworks.cwk.share;
import com.configworks.cwk.be.system.CwkConfigurationFactory; import com.configworks.cwk.be.system.Debug; import com.configworks.cwk.be.system.ICwkConfiguration; import java.io.File; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.turbine.services.intake.model.Field;
/**
 * KM-Portal utilities: a collection of assorted helper methods intended to make programming
 * with the KM-Portal easier.
 *
 * @author Christine Keim
 * @version 1
 */
public class Utils {
// Commons-logging logger for this class.
private static final Log logger = LogFactory.getLog(Utils.class.getName());
// Date-only and date-plus-time patterns, written in SimpleDateFormat syntax.
public static final String DATE_PATTERN = "dd.MM.yyyy"; public static final String DATE_TIME_PATTERN = "dd.MM.yyyy HH:mm:ss";
// Shared formatter instances built from the two patterns above.
// NOTE(review): java.text.SimpleDateFormat is documented as not synchronized, so these shared
// instances should not be used from several threads at once without external locking.
public static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat(DATE_PATTERN);
public final static SimpleDateFormat DATE_TIME_FORMAT = new SimpleDateFormat(DATE_TIME_PATTERN);
// Maximum file name length, parsed from the FILENAMES_LENGTH_MAX property of the file-based
// configuration while this class is being initialized; Integer.parseInt throws
// NumberFormatException if that property is missing or is not a number.
// Not referenced by any method visible in this listing.
private static final int MAX_FILENAME_LENGTH = Integer.parseInt(CwkConfigurationFactory.getConfigurationInstance(
CwkConfigurationFactory.PROP_FILE_CONFIGURATION).getProperty(ICwkConfiguration.FILENAMES_LENGTH_MAX));
// Icon file names; getTypeImage(int) uses the numeric type as an index into this array.
private static String[] typeimages = {"pdf.gif", "audio.gif", "video.gif", "image.gif",
"office.gif", "data.gif", "archive.gif", "link.gif",
"unknown.gif", "word.gif", "excel.gif", "powerpoint.gif"};
// Icon returned by getTypeImage(int) when the type is outside the bounds of typeimages.
private static String unknownTypeimage = "unknown.gif";
/**
 * Instance-level accessor for the date-only pattern.
 *
 * @return the value of DATE_PATTERN
 */
public String getDatePattern() {
    final String pattern = DATE_PATTERN;
    return pattern;
}

/**
 * Tells whether a string carries no content: it is null, has length zero, or consists
 * only of characters removed by String.trim().
 *
 * @param v string to check, may be null
 * @return true if the string is null or empty after trimming, otherwise false
 */
public static boolean isEmpty(String v) {
    if (v == null) {
        return true;
    }
    final String trimmed = v.trim();
    return trimmed.length() == 0;
}
/**
 * Maps a rating to the file name of the star image that represents it.
 * Negative ratings (and NaN) yield the "unrated" image, exactly 0 yields the empty-stars
 * image, and every further full point adds half a star; anything above 9 yields five stars.
 *
 * @param current the rating value
 * @return file name of the matching rating image, never null
 * @see com.configworks.cwk.fe.tools.CwkToolkit#getRatingImage(float)
 * @deprecated Use cwktoolkit.getRatingImage instead
 */
public static String getRatingImage(float current) {
    // NaN fails every comparison below and would otherwise fall through to five stars.
    if (Float.isNaN(current) || current < 0) {
        return "unrated.gif";
    }
    if (current == 0) {
        return "stars-0-0.gif";
    }
    if (current <= 1) {
        return "stars-0-5.gif";
    }
    if (current <= 2) {
        return "stars-1-0.gif";
    }
    if (current <= 3) {
        return "stars-1-5.gif";
    }
    if (current <= 4) {
        return "stars-2-0.gif";
    }
    if (current <= 5) {
        return "stars-2-5.gif";
    }
    if (current <= 6) {
        return "stars-3-0.gif";
    }
    if (current <= 7) {
        return "stars-3-5.gif";
    }
    if (current <= 8) {
        return "stars-4-0.gif";
    }
    if (current <= 9) {
        return "stars-4-5.gif";
    }
    return "stars-5-0.gif";
}

/**
 * Looks up the icon file name for a numeric document type.
 *
 * @param type index into the table of type icons
 * @return the icon registered at that index, or the "unknown" icon when the index is
 *         outside the table
 * @see com.configworks.cwk.fe.tools.CwkToolkit#getTypeImage(int)
 * @deprecated use cwktoolkit.getTypeImage instead
 */
public static String getTypeImage(int type) {
    if ((type < 0) || (type >= typeimages.length)) {
        return unknownTypeimage;
    }
    return typeimages[type];
}

/**
 * Expresses a value as a whole-number percentage of a maximum.
 * The result is rounded with Math.round and negative results are reported as 0.
 *
 * @param quality  the value to express as a percentage
 * @param maxvalue the value that corresponds to 100 percent
 * @return the rounded percentage as a decimal string; "0" when maxvalue is 0
 */
public static String calculatePercentage(float quality, float maxvalue) {
    // Dividing by zero gives Infinity or NaN; Math.round(+Infinity) is Integer.MAX_VALUE,
    // which used to be rendered as a percentage of "2147483647".
    if (maxvalue == 0) {
        return "0";
    }
    float result = (quality * 100) / maxvalue;
    if (result < 0) {
        result = 0;
    }
    return String.valueOf(Math.round(result));
}
/**
 * Upper-cases the first character of a string and leaves the remainder untouched.
 *
 * @param s the string to capitalize, may be null or empty
 * @return s with its first character converted by String.toUpperCase(); a null or empty
 *         argument is returned unchanged
 */
public static String capitalize(String s) {
    // substring(0, 1) throws StringIndexOutOfBoundsException on "", and any call on null
    // throws NullPointerException, so both are passed through as they are.
    if (s == null || s.length() == 0) {
        return s;
    }
    return s.substring(0, 1).toUpperCase() + s.substring(1);
}
/**
 * Strips the directory part from a path and returns the bare file name.
 * A path that starts with "/" is treated as a Unix path and is cut after its last "/".
 * Any other path is cut after the last "\" or "/", whichever comes later, so Windows
 * paths as well as relative or drive-letter paths written with forward slashes work.
 *
 * @param fn the path to shorten, may be null
 * @return the part after the last separator, fn itself if it contains no separator,
 *         or null if fn is null
 */
public static String cutFileName(String fn) {
    if (fn == null) {
        return null;
    }
    int cut = fn.lastIndexOf('/');
    if (!fn.startsWith("/")) {
        // Only a path that is not rooted at "/" is searched for backslashes as well.
        cut = Math.max(cut, fn.lastIndexOf('\\'));
    }
    // cut is -1 when no separator was found, which makes substring(0) return fn unchanged.
    return fn.substring(cut + 1);
}

/**
 * Cuts the simple class name from a fully qualified one, e.g.
 * com.configworks.cwk.share.om.Tutorial becomes Tutorial.
 *
 * @param fn the (possibly package-qualified) class name, may be null
 * @return the part after the last ".", fn itself if it contains no ".",
 *         or null if fn is null
 */
public static String cutClassName(String fn) {
    if (fn == null) {
        return null;
    }
    return fn.substring(fn.lastIndexOf('.') + 1);
}
/**
 * Formats a date with the date-only pattern DATE_PATTERN.
 *
 * @param date the date to format, may be null
 * @return the formatted date, or null if date is null
 * @see com.configworks.cwk.fe.tools.CwkToolkit#formatDate(Date, Locale)
 * @deprecated use cwktoolkit.formatDate instead
 */
public static String dateToString(Date date) {
    if (date == null) {
        return null;
    }
    // SimpleDateFormat is not synchronized, so a formatter is created per call (as the
    // two-argument overload does) instead of formatting through a shared static instance.
    return new SimpleDateFormat(DATE_PATTERN).format(date);
}
/**
 * Formats a date with a caller-supplied pattern.
 *
 * @param date   the date to format, may be null
 * @param format the pattern, in SimpleDateFormat syntax
 * @return the formatted date, or null if date is null
 * @deprecated use cwktoolkit.formatDate or cwktoolkit.formatDateTime instead
 */
public static String dateToString(Date date, String format) {
    if (date == null) {
        return null;
    }
    return new SimpleDateFormat(format).format(date);
}

/**
 * Formats the value held by an intake field with the date-only pattern DATE_PATTERN.
 *
 * @param date the intake field whose value is formatted, may be null
 * @return the formatted value, or null if the field or its value is null
 * @deprecated use cwktoolkit.formatDate or cwktoolkit.formatDateTime instead
 */
public static String dateToString(Field date) {
    if (date == null) {
        return null;
    }
    // Read the value once so the null check and the formatting see the same object.
    Object value = date.getValue();
    if (value == null) {
        return null;
    }
    // SimpleDateFormat is not synchronized, so a formatter is created per call instead of
    // formatting through a shared static instance.
    return new SimpleDateFormat(DATE_PATTERN).format(value);
}
/**
 * Converts a Java-style (camel case) name into a Torque-style underscore name:
 * the first character is lower-cased and every later upper-case character is replaced
 * by "_" followed by its lower-case form.
 *
 * @param javaname the Java-conforming name, may be null or empty
 * @return the underscore name; a null or empty argument is returned unchanged
 */
public static String jToU(String javaname) {
    // An empty name has no first character to lower-case (indexing it used to throw).
    if (javaname == null || javaname.length() == 0) {
        return javaname;
    }
    char[] chars = javaname.toCharArray();
    StringBuffer underscore = new StringBuffer(chars.length + 4);
    underscore.append(Character.toLowerCase(chars[0]));
    for (int i = 1; i < chars.length; i++) {
        if (Character.isUpperCase(chars[i])) {
            underscore.append("_");
            underscore.append(Character.toLowerCase(chars[i]));
        } else {
            underscore.append(chars[i]);
        }
    }
    Debug.println(javaname + " =>" + underscore);
    return underscore.toString();
}
/**
 * Replaces every newline character in a string with the HTML line break "<br>".
 *
 * @param in the text to convert, may be null
 * @return the text with each "\n" replaced by "<br>", or null if in is null
 */
public static String nl2br(String in) {
    // Null-tolerant like the other helpers of this class (isEmpty, dateToString).
    if (in == null) {
        return null;
    }
    return in.replaceAll("\n", "<br>");
}

/**
 * Prepares a string for output; currently this only converts newlines via nl2br(String).
 *
 * @param in the text to convert, may be null
 * @return the converted text, or null if in is null
 * @see com.configworks.cwk.fe.tools.CwkToolkit#htmlEncode(String)
 * @deprecated use cwktoolkit.htmlencode instead
 */
public static String out(String in) {
    return nl2br(in);
}
/**
 * Converts a Torque-style underscore name into a Java-style name: every "_" is dropped
 * and the character that follows it is upper-cased. An underscore at the very end of the
 * name has no following character and is kept as it is.
 *
 * @param uname the underscore name, may be null
 * @return the Java-conforming name, or null if uname is null
 */
public static String uToJ(String uname) {
    if (uname == null) {
        return null;
    }
    char[] chars = uname.toCharArray();
    StringBuffer java = new StringBuffer(chars.length);
    for (int i = 0; i < chars.length; i++) {
        // Guard the look-ahead: a trailing '_' used to index one past the end of the array.
        if (chars[i] == '_' && i + 1 < chars.length) {
            i++;
            java.append(Character.toUpperCase(chars[i]));
        } else {
            java.append(chars[i]);
        }
    }
    Debug.println(uname + " =>" + java);
    return java.toString();
}
/**
 * Executes an OS command built from a command template.
 * The template must contain the placeholders &lt;source&gt; and &lt;destination&gt;, in
 * either order; the first occurrence of each is replaced by the given path. On platforms
 * whose file separator is "\" every "/" in the finished command is turned into "\".
 *
 * NOTE(review): the command is passed to Runtime.exec(String) as a single string, so
 * paths containing spaces are presumably split into several arguments - confirm, and
 * consider the String[] variant of exec if such paths must be supported.
 *
 * @param execPath        the command template (path to the executable plus placeholders)
 * @param sourcePath      the path to the input file, substituted for &lt;source&gt;
 * @param destinationPath the path to the output file, substituted for &lt;destination&gt;
 * @param params          additional parameters appended to the command (ignored if null
 *                        or blank)
 * @return the created process, or null if execPath is null or lacks a placeholder
 * @throws IOException if the process cannot be started
 */
public static Process executeOSCommand(String execPath, String sourcePath,
        String destinationPath, String params)
        throws IOException {
    final String source = "<source>";
    final String destination = "<destination>";
    if (execPath == null) {
        if (logger.isErrorEnabled()) {
            logger.error("Execution command is not specified!");
        }
        return null;
    }
    int sourceIndex = execPath.indexOf(source);
    int destinationIndex = execPath.indexOf(destination);
    if ((sourceIndex < 0) || (destinationIndex < 0)) {
        // Say why nothing was started instead of returning null silently.
        if (logger.isErrorEnabled()) {
            logger.error("Execution command lacks the " + source + " or " + destination
                    + " placeholder: " + execPath);
        }
        return null;
    }

    // Substitute in template order so that either placeholder may come first; slicing
    // with the source index first threw when <destination> preceded <source>.
    boolean sourceFirst = sourceIndex < destinationIndex;
    int firstIndex = sourceFirst ? sourceIndex : destinationIndex;
    int secondIndex = sourceFirst ? destinationIndex : sourceIndex;
    String firstToken = sourceFirst ? source : destination;
    String secondToken = sourceFirst ? destination : source;
    String firstValue = sourceFirst ? sourcePath : destinationPath;
    String secondValue = sourceFirst ? destinationPath : sourcePath;

    String execCommand = execPath.substring(0, firstIndex) + firstValue
            + execPath.substring(firstIndex + firstToken.length(), secondIndex) + secondValue
            + execPath.substring(secondIndex + secondToken.length());
    if (Utils.notEmptyString(params)) {
        execCommand += " " + params;
    }
    // for windows change all / in the path to \ otherwise the command cannot be executed
    if (File.separator.equals("\\")) {
        execCommand = execCommand.replace('/', '\\');
    }
    if (logger.isTraceEnabled()) {
        logger.trace("Executing command: " + execCommand);
    }
    return Runtime.getRuntime().exec(execCommand);
}

/**
 * Negation of isEmpty(String).
 *
 * @param s the string to check, may be null
 * @return true if s is not null and contains something other than blanks
 * @see #isEmpty(String)
 * @since CWK 1.4.0
 */
public static boolean notEmptyString(String s) {
    return !isEmpty(s);
}

/**
 * Tells whether a string is null or has length zero. Unlike isEmpty(String), a string
 * that consists only of blanks is not considered empty here.
 *
 * @param s the string to check, may be null
 * @return true if s is null or ""
 * @see #isEmpty(String)
 * @since CWK 1.4.0
 * @deprecated use isEmpty instead
 */
public static boolean isEmptyString(String s) {
    return (s == null || s.length() == 0);
}
}
/****************************** ppt converter implementation ***************/
/* @(#) CWK 1.5 23.06.2004 * * Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH * Universitätsstr. 94/7 9020 Klagenfurt Austria * www.configworks.com * All rights reserved. */
package com.configworks.cwk.be.search.converters;
import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import org.apache.poi.hpsf.PropertySetFactory; import org.apache.poi.poifs.eventfilesystem.POIFSReader; import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent; import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; import org.apache.poi.util.LittleEndian;
/**
 * Converter for PowerPoint files: it reads the "PowerPoint Document" stream of the source
 * file through the POI event file system, writes the bytes of the text records it finds
 * to a text file in the configured temp directory, and returns a reader on that file.
 *
 * @author sergiu
 * @version 1.0
 * @since CWK 1.5
 */
public class PPTConverterImpl extends Converter {static final String lineSeparator = System.getProperty("line.separator");
// lineSeparator (declared on the line above) holds the platform line separator; it is not
// referenced by any method visible in this listing.
// Output stream onto the text file; opened by convertSource or initWriter and handed to
// the POI listener in extractText.
private BufferedOutputStream txtFileWriter = null;
// Text file written by the most recent convertSource call.
File dest = null;
/**
 * Converts a PowerPoint file into a text file and opens a reader on the result.
 * The text file is created in the configured temp directory, named after the source file
 * with every "." replaced by "_" and ".txt" appended, and is registered for deletion when
 * the JVM exits.
 *
 * @param source the PowerPoint file to convert, may be null
 * @return a reader on the extracted text, or null if source is null, the text file does
 *         not exist after extraction, or the conversion failed (the failure is logged)
 * @see com.configworks.cwk.be.search.converters.Converter#convertSource(java.io.File)
 */
public Reader convertSource(File source) {
    if (source == null) {
        return null;
    }
    Reader reader = null;
    InputStream inputStream = null;
    try {
        String filename = source.getName().replace('.', '_') + ".txt";
        File tmpDir = new File(_config.getTempDirectory());
        tmpDir.mkdirs();
        dest = new File(tmpDir.getPath(), filename);
        dest.createNewFile();
        //create the input and output streams
        txtFileWriter = new BufferedOutputStream(new FileOutputStream(dest));
        inputStream = new FileInputStream(source);
        extractText(inputStream);
        if (!dest.exists()) {
            return null;
        }
        dest.deleteOnExit();
        reader = new BufferedReader(new FileReader(dest));
    } catch (Exception e) {
        getLogger().error("JavaDocumentConverter cannot convert the source file: "
                + source.getAbsolutePath(), e);
        reader = null;
    } finally {
        // Close the two streams independently: a failure while closing (and flushing) the
        // writer must not leave the input stream open.
        try {
            if (txtFileWriter != null) {
                txtFileWriter.close();
            }
        } catch (IOException ex) {
            // Guard with the level that is actually logged (was isDebugEnabled).
            if (getLogger().isErrorEnabled()) {
                getLogger().error("Cannot close the stream: " + ex);
            }
        }
        try {
            if (inputStream != null) {
                inputStream.close();
            }
        } catch (IOException ex) {
            if (getLogger().isErrorEnabled()) {
                getLogger().error("Cannot close the stream: " + ex);
            }
        }
    }
    return reader;
}
/**
 * Extracts the text of a presentation and writes it to the current text file writer.
 * Registers a listener for the "PowerPoint Document" stream and lets POI read the input.
 *
 * @param reader the raw content of the PowerPoint file (an input stream, despite the name)
 * @return true when POI read the input without throwing
 * @throws IOException if the writer has not been initialized or the input cannot be read
 */
public boolean extractText(InputStream reader) throws IOException {
    if (txtFileWriter == null) {
        throw new IOException("Writer Not Iititalized!");
    }
    POIFSReader r = new POIFSReader();
    /* Register a listener for the "PowerPoint Document" stream only. */
    PptDocReaderListener listener = new PptDocReaderListener(txtFileWriter);
    r.registerListener(listener, "PowerPoint Document");
    r.read(reader);
    // if no exception was thrown, consider that the conversion was successful
    // (this return statement had been swallowed into the comment above it)
    return true;
}

/**
 * POI event listener that scans the "PowerPoint Document" stream for records of type 4008
 * and copies their bytes to the given output stream.
 */
class PptDocReaderListener implements POIFSReaderListener {
    // Destination of the extracted bytes; stays null when the no-argument constructor is used.
    private BufferedOutputStream writer = null;

    public PptDocReaderListener() {
    }

    public PptDocReaderListener(BufferedOutputStream writer) {
        this.writer = writer;
    }

    /**
     * Handles one stream of the file; every stream except "PowerPoint Document" is ignored.
     * Failures are logged and not propagated.
     *
     * @param event the POI event describing the stream
     */
    public void processPOIFSReaderEvent(POIFSReaderEvent event) {
        try {
            if (!event.getName().equalsIgnoreCase("PowerPoint Document")) {
                return;
            }
            org.apache.poi.poifs.filesystem.DocumentInputStream dis = event.getStream();
            // The first 12 bytes are read and discarded; the rest is loaded in one piece.
            byte btoWrite[] = new byte[12];
            dis.read(btoWrite);
            btoWrite = new byte[dis.available()];
            dis.read(btoWrite, 0, btoWrite.length);

            for (int i = 0; i < btoWrite.length - 20; i++) {
                long type = LittleEndian.getUShort(btoWrite, i + 2);
                long size = LittleEndian.getUInt(btoWrite, i + 4);
                // 4008 is the TextBytesAtom record type in the PowerPoint binary format.
                if (type == 4008) {
                    int offset = i + 4 + 1;
                    // The scan moves byte by byte, so a chance match can carry a bogus size.
                    // Skip such a hit instead of letting write() throw and end the extraction.
                    long end = (long) offset + size + 3;
                    if (end > btoWrite.length) {
                        continue;
                    }
                    int length = (int) size + 3;
                    writer.write(btoWrite, offset, length);
                    // skip the bytes that were already read (this statement had been
                    // swallowed into its comment); minus one because the loop's i++ then
                    // lands exactly on the first byte after the record
                    i = offset + length - 1;
                }
            }
            // NOTE(review): the stream was already consumed above and the result is
            // discarded; this looks like a leftover - confirm before removing.
            PropertySetFactory.create(event.getStream());
        } catch (Exception e) {
            String msg = "Cannot index ppt file: \n";
            if (getLogger().isErrorEnabled()) {
                getLogger().error(msg + e);
            }
        }
    }
}
/**
 * Opens the buffered output stream onto the given file, unless a writer is already set.
 *
 * @param dest the file the extracted text is written to
 * @throws IOException if the file cannot be opened for writing
 * @see com.configworks.cwk.be.search.converters.JavaDocumentConverter#initWriter(java.io.File)
 */
public void initWriter(File dest) throws IOException {
    if (txtFileWriter != null) {
        // A writer already exists; it is left as it is.
        return;
    }
    FileOutputStream fileStream = new FileOutputStream(dest);
    txtFileWriter = new BufferedOutputStream(fileStream);
}
/**
 * Closes the text file writer.
 *
 * @throws IOException if no writer has been set, or if closing it fails
 * @see com.configworks.cwk.be.search.converters.JavaDocumentConverter#closeWriter()
 */
public void closeWriter() throws IOException {
    if (txtFileWriter == null) {
        throw new IOException("Cannot close the writer, the object is Null!");
    }
    txtFileWriter.close();
}
}
Do you Yahoo!?
Take Yahoo! Mail with you! <http://us.rd.yahoo.com/mail_us/taglines/mobile/*http://mobile.yahoo.com/maildemo> Get it on your mobile phone.
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]
