Author: siren
Date: Wed Feb 18 12:43:04 2009
New Revision: 745499

URL: http://svn.apache.org/viewvc?rev=745499&view=rev
Log:
NUTCH-691 - Update jakarta poi jars to the most relevant version, contributed by Dmitry Lihachev

Added: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar (with props) lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar (with props) Removed: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml lucene/nutch/trunk/src/plugin/parse-msword/build.xml lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745499&r1=745498&r2=745499&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:43:04 2009 @@ -343,6 +343,9 @@ 128. NUTCH-631 - MoreIndexingFilter fails with NoSuchElementException (Stefan Will, siren) +129. NUTCH-691 - Update jakarta poi jars to the most relevant version + (Dmitry Lihachev via siren) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Added: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar?rev=745499&view=auto ============================================================================== Binary file - no diff available. 
Propchange: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar?rev=745499&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml?rev=745499&r1=745498&r2=745499&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml Wed Feb 18 12:43:04 2009 @@ -29,10 +29,10 @@ provider-name="jakarta.apache.org"> <runtime> - <library name="poi-3.0-alpha1-20050704.jar"> + <library name="poi-3.5-beta4-20081128.jar"> <export name="*"/> </library> - <library name="poi-scratchpad-3.0-alpha1-20050704.jar"> + <library name="poi-scratchpad-3.5-beta4-20081128.jar"> <export name="*"/> </library> </runtime> Modified: lucene/nutch/trunk/src/plugin/parse-msword/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/build.xml?rev=745499&r1=745498&r2=745499&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/build.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/build.xml Wed Feb 18 12:43:04 2009 @@ -44,7 +44,8 @@ 
<!-- for junit test --> <mkdir dir="${build.test}/data"/> - <copy file="sample/word95.doc" todir="${build.test}/data"/> - <copy file="sample/word97.doc" todir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="sample" includes="*.doc" /> + </copy> </project> Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java?rev=745499&r1=745498&r2=745499&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java Wed Feb 18 12:43:04 2009 @@ -53,8 +53,9 @@ int chpTableSize = LittleEndian.getInt(mainStream, 0xbc); // get a list of character properties + Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset, - chpTableSize, fcMin); + chpTableSize, fcMin, new TextPieceTable()); List textRuns = chpTable.getTextRuns(); // iterate through the Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java?rev=745499&r1=745498&r2=745499&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java Wed Feb 18 12:43:04 2009 @@ -119,11 +119,12 @@ int chpOffset = LittleEndian.getInt(header, 0xfa); int chpSize = LittleEndian.getInt(header, 0xfe); int fcMin = LittleEndian.getInt(header, 
0x18); - CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin); // load our text pieces and our character runs ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin); TextPieceTable tpt = cft.getTextPieceTable(); + CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt); + List textPieces = tpt.getTextPieces(); // make the POIFS objects available for garbage collection Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java?rev=745499&r1=745498&r2=745499&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java Wed Feb 18 12:43:04 2009 @@ -45,7 +45,7 @@ * @param fcMin The start of text in the main stream. 
*/ public Word6CHPBinTable(byte[] documentStream, int offset, - int size, int fcMin) + int size, int fcMin, TextPieceTable tpt) { PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2); @@ -58,7 +58,7 @@ int pageOffset = POIFSConstants.BIG_BLOCK_SIZE * pageNum; CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream, - pageOffset, fcMin); + pageOffset, fcMin, tpt); int fkpSize = cfkp.size(); Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?rev=745499&r1=745498&r2=745499&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Wed Feb 18 12:43:04 2009 @@ -32,6 +32,9 @@ import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; +import java.io.File; +import java.io.FilenameFilter; + import junit.framework.TestCase; /** @@ -50,31 +53,38 @@ private String[] sampleFiles = {"word95.doc","word97.doc"}; private String expectedText = "This is a sample doc file prepared for nutch."; + + private Configuration conf; public TestMSWordParser(String name) { super(name); } - protected void setUp() {} + protected void setUp() { + conf = NutchConfiguration.create(); + conf.set("file.content.limit", "-1"); + } protected void tearDown() {} + public String getTextContent(String fileName) throws ProtocolException, ParseException { + String urlString = "file:" + sampleDir + fileSeparator + fileName; + Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); + Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); + Parse parse = new 
ParseUtil(conf).parseByExtensionId("parse-msword", content).get(content.getUrl()); + return parse.getText(); + } + public void testIt() throws ProtocolException, ParseException { - String urlString; - Protocol protocol; - Content content; - Parse parse; - - Configuration conf = NutchConfiguration.create(); for (int i=0; i<sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content).get(content.getUrl()); - - assertTrue(parse.getText().startsWith(expectedText)); + assertTrue(getTextContent(sampleFiles[i]).startsWith(expectedText)); } } + public void testOpeningDocs() throws ProtocolException, ParseException { + String[] filenames = new File(sampleDir).list(); + for (int i = 0; i < filenames.length; i++) { + assertTrue("cann't read content of " + filenames[i], getTextContent(filenames[i]).length() > 0); + } + } }