Author: nick Date: Mon Feb 13 04:58:52 2006 New Revision: 377371 URL: http://svn.apache.org/viewcvs?rev=377371&view=rev Log: Friendly wrapper on HWPF for extracting text from Word Documents
Added: jakarta/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/ jakarta/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java Added: jakarta/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java URL: http://svn.apache.org/viewcvs/jakarta/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java?rev=377371&view=auto ============================================================================== --- jakarta/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java (added) +++ jakarta/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java Mon Feb 13 04:58:52 2006 @@ -0,0 +1,123 @@ +package org.apache.poi.hwpf.extractor; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.util.Iterator; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.model.TextPiece; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +/** + * Class to extract the text from a Word Document. + * + * You should use either getParagraphText() or getText() unless + * you have a strong reason otherwise. + * + * @author Nick Burch (nick at torchbox dot com) + */ +public class WordExtractor { + private POIFSFileSystem fs; + private HWPFDocument doc; + + /** + * Create a new Word Extractor + * @param is InputStream containing the word file + */ + public WordExtractor(InputStream is) throws IOException { + this(new POIFSFileSystem(is)); + } + + /** + * Create a new Word Extractor + * @param fs POIFSFileSystem containing the word file + */ + public WordExtractor(POIFSFileSystem fs) throws IOException { + this.fs = fs; + doc = new HWPFDocument(fs); + } + + /** + * Get the text from the word file, as an array with one String + * per paragraph + */ + public String[] getParagraphText() { + String[] ret; + + // Extract using the model code + try { + Range r = doc.getRange(); + + ret = new String[r.numParagraphs()]; + for(int i=0; i<ret.length; i++) { + Paragraph p = r.getParagraph(i); + ret[i] = p.text(); + + // Fix the line ending + if(ret[i].endsWith("\r")) { + ret[i] = ret[i] + "\n"; + } + } + } catch(Exception e) { + // Something's up with turning the text pieces into paragraphs + // Fall back to ripping out the text pieces + ret = new String[1]; + ret[0] = getTextFromPieces(); + } + + return ret; + } + + /** + * Grab the text out of the text pieces. Might also include various + * bits of crud, but will work in cases where the text piece -> paragraph + * mapping is broken. Fast too. + */ + public String getTextFromPieces() { + StringBuffer textBuf = new StringBuffer(); + + Iterator textPieces = doc.getTextTable().getTextPieces().iterator(); + while (textPieces.hasNext()) { + TextPiece piece = (TextPiece) textPieces.next(); + + String encoding = "Cp1252"; + if (piece.usesUnicode()) { + encoding = "UTF-16LE"; + } + try { + String text = new String(piece.getRawBytes(), encoding); + textBuf.append(text); + } catch(UnsupportedEncodingException e) { + throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken"); + } + } + + String text = textBuf.toString(); + + // Fix line endings (Note - won't get all of them + text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n"); + text = text.replaceAll("\r\r", "\r\n\r\n"); + + if(text.endsWith("\r")) { + text += "\n"; + } + + return text; + } + + /* + * Grab the text, based on the paragraphs. Shouldn't include any crud, + * but slightly slower than getTextFromPieces() + */ + public String getText() { + StringBuffer ret = new StringBuffer(); + String[] text = getParagraphText(); + for(int i=0; i<text.length; i++) { + ret.append(text[i]); + } + return ret.toString(); + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] Mailing List: http://jakarta.apache.org/site/mail2.html#poi The Apache Jakarta POI Project: http://jakarta.apache.org/poi/