Author: nick
Date: Mon Feb 13 04:59:00 2006
New Revision: 377372

URL: http://svn.apache.org/viewcvs?rev=377372&view=rev
Log:
Friendly wrapper on HWPF for extracting text from Word Documents

Added:
    jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc (with props)
    jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/
    jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java
    jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java

Added: jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc
URL: http://svn.apache.org/viewcvs/jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc?rev=377372&view=auto
==============================================================================
Binary file - no diff available.

Propchange: jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc
------------------------------------------------------------------------------
    svn:executable = *

Propchange: jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java
URL: http://svn.apache.org/viewcvs/jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java?rev=377372&view=auto
==============================================================================
--- jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java (added)
+++ jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java Mon Feb 13 04:59:00 2006
@@ -0,0 +1,87 @@
+package org.apache.poi.hwpf.extractor;
+
+import java.io.FileInputStream;
+import java.util.Iterator;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+
+import junit.framework.TestCase;
+
+/**
+ * Test the different routes to extracting text
+ *
+ * @author Nick Burch (nick at torchbox dot com)
+ */
+public class TestDifferentRoutes extends TestCase {
+    private String[] p_text = new String[] {
+        "This is a simple word document\r",
+        "\r",
+        "It has a number of paragraphs in it\r",
+        "\r",
+        "Some of them even feature bold, italic and underlined text\r",
+        "\r",
+        "\r",
+        "This bit is in a different font and size\r",
+        "\r",
+        "\r",
+        "This bit features some red text.\r",
+        "\r",
+        "\r",
+        "It is otherwise very very boring.\r"
+    };
+
+    private HWPFDocument doc;
+
+    protected void setUp() throws Exception {
+        String dirname = System.getProperty("HWPF.testdata.path");
+
+        String filename = dirname + "/test2.doc";
+        doc = new HWPFDocument(new FileInputStream(filename));
+    }
+
+    /**
+     * Test model based extraction
+     */
+    public void testExtractFromModel() {
+        Range r = doc.getRange();
+
+        String[] text = new String[r.numParagraphs()];
+        for(int i=0; i < r.numParagraphs(); i++) {
+            Paragraph p = r.getParagraph(i);
+            text[i] = p.text();
+        }
+
+        assertEquals(p_text.length, text.length);
+        for(int i=0; i<p_text.length; i++) {
+            assertEquals(p_text[i], text[i]);
+        }
+    }
+
+    /**
+     * Test textPieces based extraction
+     */
+    public void testExtractFromTextPieces() throws Exception {
+        StringBuffer textBuf = new StringBuffer();
+
+        Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
+        while (textPieces.hasNext()) {
+            TextPiece piece = (TextPiece) textPieces.next();
+
+            String encoding = "Cp1252";
+            if (piece.usesUnicode()) {
+                encoding = "UTF-16LE";
+            }
+            String text = new String(piece.getRawBytes(), encoding);
+            textBuf.append(text);
+        }
+
+        StringBuffer exp = new StringBuffer();
+        for(int i=0; i<p_text.length; i++) {
+            exp.append(p_text[i]);
+        }
+        assertEquals(exp.toString(), textBuf.toString());
+    }
+}

Added: jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
URL:
http://svn.apache.org/viewcvs/jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java?rev=377372&view=auto
==============================================================================
--- jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java (added)
+++ jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java Mon Feb 13 04:59:00 2006
@@ -0,0 +1,88 @@
+package org.apache.poi.hwpf.extractor;
+
+import java.io.FileInputStream;
+import java.util.Iterator;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+
+import junit.framework.TestCase;
+
+/**
+ * Test the different routes to extracting text
+ *
+ * @author Nick Burch (nick at torchbox dot com)
+ */
+public class TestWordExtractor extends TestCase {
+    private String[] p_text1 = new String[] {
+        "This is a simple word document\r\n",
+        "\r\n",
+        "It has a number of paragraphs in it\r\n",
+        "\r\n",
+        "Some of them even feature bold, italic and underlined text\r\n",
+        "\r\n",
+        "\r\n",
+        "This bit is in a different font and size\r\n",
+        "\r\n",
+        "\r\n",
+        "This bit features some red text.\r\n",
+        "\r\n",
+        "\r\n",
+        "It is otherwise very very boring.\r\n"
+    };
+    private String p_text1_block = new String();
+
+    // Well behaved document
+    private WordExtractor extractor;
+    // Corrupted document - can't do paragraph based stuff
+    private WordExtractor extractor2;
+
+    protected void setUp() throws Exception {
+        String dirname = System.getProperty("HWPF.testdata.path");
+
+        String filename = dirname + "/test2.doc";
+        String filename2 = dirname + "/test.doc";
+        extractor = new WordExtractor(new FileInputStream(filename));
+        extractor2 = new WordExtractor(new FileInputStream(filename2));
+
+        // Build splat'd out text version
+        for(int i=0; i<p_text1.length; i++) {
+            p_text1_block += p_text1[i];
+        }
+    }
+
+    /**
+     * Test paragraph based extraction
+     */
+    public void testExtractFromParagraphs() {
+        String[] text = extractor.getParagraphText();
+
+        assertEquals(p_text1.length, text.length);
+        for(int i=0; i<p_text1.length; i++) {
+            assertEquals(p_text1[i], text[i]);
+        }
+
+        // On second one, should fall back
+        assertEquals(1, extractor2.getParagraphText().length);
+    }
+
+    /**
+     * Test the paragraph -> flat extraction
+     */
+    public void testGetText() {
+        assertEquals(p_text1_block, extractor.getText());
+
+        // On second one, should fall back to text piece
+        assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
+    }
+
+    /**
+     * Test textPieces based extraction
+     */
+    public void testExtractFromTextPieces() throws Exception {
+        String text = extractor.getTextFromPieces();
+        assertEquals(p_text1_block, text);
+    }
+}

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
Mailing List: http://jakarta.apache.org/site/mail2.html#poi
The Apache Jakarta POI Project: http://jakarta.apache.org/poi/