WordExtractor.java

nick Mon, 13 Feb 2006 04:59:16 -0800

Author: nick
Date: Mon Feb 13 04:58:52 2006
New Revision: 377371

URL: http://svn.apache.org/viewcvs?rev=377371&view=rev
Log:
Friendly wrapper on HWPF for extracting text from Word Documents


Added:
    jakarta/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/
    
jakarta/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java

Added: 
jakarta/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
URL: 
http://svn.apache.org/viewcvs/jakarta/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java?rev=377371&view=auto
==============================================================================
--- 
jakarta/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
 (added)
+++ 
jakarta/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
 Mon Feb 13 04:58:52 2006
@@ -0,0 +1,123 @@
+package org.apache.poi.hwpf.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Iterator;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Class to extract the text from a Word Document.
+ *
+ * You should use either getParagraphText() or getText() unless
+ *  you have a strong reason otherwise.
+ *
+ * @author Nick Burch (nick at torchbox dot com)
+ */
+public class WordExtractor {
+       private POIFSFileSystem fs;
+       private HWPFDocument doc;
+
+       /**
+        * Create a new Word Extractor
+        * @param is InputStream containing the word file
+        * @throws IOException if the word file cannot be read
+        */
+       public WordExtractor(InputStream is) throws IOException {
+               this(new POIFSFileSystem(is));
+       }
+
+       /**
+        * Create a new Word Extractor
+        * @param fs POIFSFileSystem containing the word file
+        * @throws IOException if the word file cannot be read
+        */
+       public WordExtractor(POIFSFileSystem fs) throws IOException {
+               this.fs = fs;
+               doc = new HWPFDocument(fs);
+       }
+
+       /**
+        * Get the text from the word file, as an array with one String
+        *  per paragraph. If the paragraphs cannot be built, falls back to
+        *  a single String holding the result of getTextFromPieces().
+        * @return the paragraph text; a trailing \r is returned as \r\n
+        */
+       public String[] getParagraphText() {
+               String[] ret;
+
+               // Extract using the model code
+               try {
+                       Range r = doc.getRange();
+
+                       ret = new String[r.numParagraphs()];
+                       for(int i=0; i<ret.length; i++) {
+                               Paragraph p = r.getParagraph(i);
+                               ret[i] = p.text();
+
+                               // Fix the line ending
+                               if(ret[i].endsWith("\r")) {
+                                       ret[i] = ret[i] + "\n";
+                               }
+                       }
+               } catch(Exception e) {
+                       // Something's up with turning the text pieces into paragraphs
+                       // Fall back to ripping out the text pieces
+                       ret = new String[1];
+                       ret[0] = getTextFromPieces();
+               }
+
+               return ret;
+       }
+
+       /**
+        * Grab the text out of the text pieces. Might also include various
+        *  bits of crud, but will work in cases where the text piece ->
+        *  paragraph mapping is broken. Fast too.
+        * @return the text, with every \r not followed by \n turned into \r\n
+        */
+       public String getTextFromPieces() {
+               StringBuffer textBuf = new StringBuffer();
+
+               Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
+               while (textPieces.hasNext()) {
+                       TextPiece piece = (TextPiece) textPieces.next();
+
+                       // Decode as UTF-16LE for unicode pieces, as Cp1252 otherwise
+                       String encoding = "Cp1252";
+                       if (piece.usesUnicode()) {
+                               encoding = "UTF-16LE";
+                       }
+                       try {
+                               String text = new String(piece.getRawBytes(), encoding);
+                               textBuf.append(text);
+                       } catch(UnsupportedEncodingException e) {
+                               throw new InternalError("Standard Encoding " + encoding
+                                       + " not found, JVM broken");
+                       }
+               }
+
+               // Fix line endings: every \r that is not already followed by \n
+               //  becomes \r\n, which covers lone, repeated and trailing \r
+               return textBuf.toString().replaceAll("\r(?!\n)", "\r\n");
+       }
+
+       /**
+        * Grab the text, based on the paragraphs. Shouldn't include any crud,
+        *  but slightly slower than getTextFromPieces()
+        * @return the strings from getParagraphText(), joined in order
+        */
+       public String getText() {
+               StringBuffer ret = new StringBuffer();
+               String[] text = getParagraphText();
+               for(int i=0; i<text.length; i++) {
+                       ret.append(text[i]);
+               }
+               return ret.toString();
+       }
+}



---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
Mailing List:    http://jakarta.apache.org/site/mail2.html#poi
The Apache Jakarta POI Project: http://jakarta.apache.org/poi/

svn commit: r377371 - in /jakarta/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/extractor: ./ WordExtractor.java

Reply via email to