svn commit: r377494 - in /lucene/nutch/trunk/src/plugin: parse-msexcel/ parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ parse-mspowerpoint/ parse-mspowerpoint/src/java/org/apache/nutch/parse/ms
Author: jerome Date: Mon Feb 13 13:28:13 2006 New Revision: 377494 URL: http://svn.apache.org/viewcvs?rev=377494view=rev Log: Make use of lib-parsems in word, powerpoint and excel parsers Removed: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html lucene/nutch/trunk/src/plugin/parse-msword/build.xml lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml?rev=377494r1=377493r2=377494view=diff == --- lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml Mon Feb 13 13:28:13 2006 @@ -2,19 +2,23 @@ project name=parse-msexcel default=jar - import file=../build-plugin.xml / + import file=../build-plugin.xml / path id=plugin.deps fileset dir=../lib-jakarta-poi/lib include name=*.jar / /fileset +fileset dir=../../../build/lib-parsems + include name=*.jar / +/fileset /path - !-- for junit test -- - mkdir dir=${build.test}/data / - copy todir=${build.test}/data - fileset dir=sample - include name=*.xls / - /fileset - /copy + !-- for junit test -- + mkdir dir=${build.test}/data / + copy todir=${build.test}/data +fileset dir=sample + include name=*.xls / +/fileset + /copy + /project Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml?rev=377494r1=377493r2=377494view=diff == --- lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml Mon Feb 13 13:28:13 2006 @@ -14,6 +14,7 @@ requires import plugin=nutch-extensionpoints/ import plugin=lib-jakarta-poi/ + import plugin=lib-parsems/ /requires extension id=org.apache.nutch.parse.msexcel Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java?rev=377494r1=377493r2=377494view=diff == --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java (original) +++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java Mon Feb 13 13:28:13 2006 @@ -16,17 +16,17 @@ package org.apache.nutch.parse.msexcel; // JDK imports -import java.io.IOException; import java.io.InputStream; -import java.util.Date; -import java.util.Properties; // Jakarta POI imports import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import
svn commit: r377501 - in /lucene/nutch/trunk: ./ src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/ src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/
Author: jerome Date: Mon Feb 13 13:43:15 2006 New Revision: 377501 URL: http://svn.apache.org/viewcvs?rev=377501&view=rev Log: Javadoc updates for ms parsers Added: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html (with props) Modified: lucene/nutch/trunk/build.xml lucene/nutch/trunk/default.properties lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=377501&r1=377500&r2=377501&view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Mon Feb 13 13:43:15 2006 @@ -249,6 +249,7 @@ <packageset dir="${src.dir}"/> <packageset dir="${plugins.dir}/lib-http/src/java"/> + <packageset dir="${plugins.dir}/lib-parsems/src/java"/> <packageset dir="${plugins.dir}/ontology/src/java"/> <packageset dir="${plugins.dir}/protocol-file/src/java"/> <packageset dir="${plugins.dir}/protocol-ftp/src/java"/> Modified: lucene/nutch/trunk/default.properties URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=377501&r1=377500&r2=377501&view=diff == --- lucene/nutch/trunk/default.properties (original) +++ lucene/nutch/trunk/default.properties Mon Feb 13 13:43:15 2006 @@ -68,6 +68,7 @@ plugin.msword=org.apache.nutch.parse.msword* # Unfortunately, ontology on core and plugin uses the same package: # plugin.ontology=org.apache.nutch.ontology* +plugin.parsems=org.apache.nutch.parse.ms* plugin.pdf=org.apache.nutch.parse.pdf* plugin.rss=org.apache.nutch.parse.rss* plugin.rtf=org.apache.nutch.parse.rtf* @@ -95,6 +96,7 @@ ${plugin.msexcel}:\ ${plugin.mspowerpoint}:\ ${plugin.msword}:\ + ${plugin.parsems}:\ ${plugin.pdf}:\ ${plugin.rss}:\ ${plugin.rtf}:\ Modified: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java?rev=377501&r1=377500&r2=377501&view=diff == --- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java Mon Feb 13 13:43:15 2006 @@ -56,7 +56,7 @@ /** * Parses a Content with a specific {@link MSExtractor Microsoft document - * extractor. + * extractor}. */ protected Parse getParse(MSExtractor extractor, Content content) { Added: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html?rev=377501&view=auto == --- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html (added) +++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html Mon Feb 13 13:43:15 2006 @@ -0,0 +1,5 @@ +<html> +<body> +<p>Common API for Microsoft &copy; documents parsing.</p> +</body> +</html> Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/package.html -- svn:eol-style = native Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java?rev=377501&r1=377500&r2=377501&view=diff == --- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/FilteredStringWriter.java Mon Feb 13 13:43:15 2006 @@ -23,7 +23,6 @@ * * @author Stephan Strittmatter - http://www.sybit.de * @version 1.0 - * 
@create 19.01.2005 */ public class FilteredStringWriter extends StringWriter { @@ -67,4 +66,4 @@ super.write(ch); } } -} \ No newline at end of file +}