Author: jerome Date: Fri Feb 10 09:08:23 2006 New Revision: 376768 URL: http://svn.apache.org/viewcvs?rev=376768&view=rev Log: NUTCH-52, Add a parser plugin for MS Excel files
Added: lucene/nutch/trunk/src/plugin/parse-msexcel/ lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml (with props) lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml (with props) lucene/nutch/trunk/src/plugin/parse-msexcel/sample/ lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls (with props) lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content lucene/nutch/trunk/src/plugin/parse-msexcel/src/ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java (with props) lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java (with props) lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java (with props) lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html (with props) lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java (with props) Modified: lucene/nutch/trunk/build.xml lucene/nutch/trunk/default.properties lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/build.xml URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376768&r1=376767&r2=376768&view=diff ============================================================================== --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Fri Feb 10 09:08:23 2006 @@ -254,6 +254,7 @@ <packageset dir="${plugins.dir}/parse-pdf/src/java"/> <!-- <packageset dir="${plugins.dir}/parse-rtf/src/java"/> plugin excluded from build due to licensing issues--> <!-- <packageset dir="${plugins.dir}/parse-mp3/src/java"/> plugin excluded from build due to licensing issues--> + <packageset dir="${plugins.dir}/parse-msexcel/src/java"/> <packageset dir="${plugins.dir}/parse-mspowerpoint/src/java"/> <packageset dir="${plugins.dir}/parse-msword/src/java"/> <packageset dir="${plugins.dir}/parse-rss/src/java"/> Modified: lucene/nutch/trunk/default.properties URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=376768&r1=376767&r2=376768&view=diff ============================================================================== --- lucene/nutch/trunk/default.properties (original) +++ lucene/nutch/trunk/default.properties Fri Feb 10 09:08:23 2006 @@ -63,6 +63,7 @@ plugin.libhttp=org.apache.nutch.protocol.http.api* plugin.more=org.apache.nutch.indexer.more*:org.apache.nutch.searcher.more* plugin.mp3=org.apache.nutch.parse.mp3* +plugin.msexcel=org.apache.nutch.parse.msexcel* plugin.mspowerpoint=org.apache.nutch.parse.mspowerpoint* plugin.msword=org.apache.nutch.parse.msword* # Unfortunately, ontology on core and plugin uses the same package: @@ -91,6 +92,7 @@ ${plugin.libhttp}:\ ${plugin.more}:\ ${plugin.mp3}:\ + ${plugin.msexcel}:\ ${plugin.mspowerpoint}:\ ${plugin.msword}:\ ${plugin.pdf}:\ Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=376768&r1=376767&r2=376768&view=diff ============================================================================== --- 
lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Fri Feb 10 09:08:23 2006 @@ -24,6 +24,7 @@ <ant dir="parse-html" target="deploy"/> <ant dir="parse-js" target="deploy"/> <!-- <ant dir="parse-mp3" target="deploy"/> --> + <ant dir="parse-msexcel" target="deploy"/> <ant dir="parse-mspowerpoint" target="deploy"/> <ant dir="parse-msword" target="deploy"/> <ant dir="parse-pdf" target="deploy"/> @@ -52,6 +53,7 @@ <ant dir="parse-ext" target="test"/> <ant dir="parse-html" target="test"/> <!-- <ant dir="parse-mp3" target="test"/> --> + <ant dir="parse-msexcel" target="test"/> <ant dir="parse-mspowerpoint" target="test"/> <ant dir="parse-msword" target="test"/> <ant dir="parse-pdf" target="test"/> @@ -86,6 +88,7 @@ <ant dir="parse-html" target="clean"/> <ant dir="parse-js" target="clean"/> <ant dir="parse-mp3" target="clean"/> + <ant dir="parse-msexcel" target="clean"/> <ant dir="parse-mspowerpoint" target="clean"/> <ant dir="parse-msword" target="clean"/> <ant dir="parse-pdf" target="clean"/> Added: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml?rev=376768&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml (added) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml Fri Feb 10 09:08:23 2006 @@ -0,0 +1,20 @@ +<?xml version="1.0"?> + +<project name="parse-msexcel" default="jar"> + + <import file="../build-plugin.xml" /> + + <path id="plugin.deps"> + <fileset dir="../lib-jakarta-poi/lib"> + <include name="*.jar" /> + </fileset> + </path> + + <!-- for junit test --> + <mkdir dir="${build.test}/data" /> + <copy todir="${build.test}/data"> + <fileset dir="sample"> + <include name="*.xls" /> + </fileset> + </copy> +</project> Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml 
------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml?rev=376768&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml Fri Feb 10 09:08:23 2006 @@ -0,0 +1,29 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="parse-msexcel" + name="MSExcel Parse Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="parse-msexcel.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + <import plugin="lib-jakarta-poi"/> + </requires> + + <extension id="org.apache.nutch.parse.msexcel" + name="MSExcelParser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.apache.nutch.parse.msexcel.MSExcelParser" + class="org.apache.nutch.parse.msexcel.MSExcelParser" + contentType="application/vnd.ms-excel" + pathSuffix="xls"/> + </extension> + +</plugin> Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls?rev=376768&view=auto ============================================================================== Binary file - no diff available. 
Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content?rev=376768&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content (added) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content Fri Feb 10 09:08:23 2006 @@ -0,0 +1,3 @@ +BitStream test.xls 321654.0 Apache incubator 1234.0 Doug Cutting 89078 CS 599 Search Engines Spring 2005.0 SBC 1234.0 764893.0 Java NUTCH!! + +BitStream test.xls 321654.0 Apache incubator 1234.0 Doug Cutting 89078.0 CS 599 Search Engines Spring 2005.0 SBC 1234.0 764893.0 Java NUTCH!! \ No newline at end of file Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java?rev=376768&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java (added) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java Fri Feb 10 09:08:23 2006 @@ -0,0 +1,132 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse.msexcel; + +// JDK imports +import java.io.IOException; +import java.io.InputStream; +import java.util.Date; +import java.util.Properties; + +// Jakarta POI imports +import org.apache.poi.hssf.usermodel.HSSFCell; +import org.apache.poi.hssf.usermodel.HSSFRow; +import org.apache.poi.hssf.usermodel.HSSFSheet; +import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.poifs.eventfilesystem.POIFSReader; + +/** + * Excel Text and Properties extractor. + * + * @author Rohit Kulkarni & Ashish Vaidya + * @author Jérôme Charron + */ +public class ExcelExtractor { + + + public String extractText(InputStream input) throws IOException { + + String resultText = ""; + HSSFWorkbook wb = new HSSFWorkbook(input); + if (wb == null) { + return resultText; + } + + HSSFSheet sheet; + HSSFRow row; + HSSFCell cell; + int sNum = 0; + int rNum = 0; + int cNum = 0; + + sNum = wb.getNumberOfSheets(); + + for (int i=0; i<sNum; i++) { + if ((sheet = wb.getSheetAt(i)) == null) { + continue; + } + rNum = sheet.getLastRowNum(); + for (int j=0; j<=rNum; j++) { + if ((row = sheet.getRow(j)) == null){ + continue; + } + cNum = row.getLastCellNum(); + + for (int k=0; k<cNum; k++) { + if ((cell = row.getCell((short) k)) != null) { + /*if(HSSFDateUtil.isCellDateFormatted(cell) == true) { + resultText += cell.getDateCellValue().toString() + " "; + } else + */ + if (cell.getCellType() == HSSFCell.CELL_TYPE_STRING) { + resultText += cell.getStringCellValue() + " "; + } else if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) { + Double 
d = new Double(cell.getNumericCellValue()); + resultText += d.toString() + " "; + } + /* else if(cell.getCellType() == HSSFCell.CELL_TYPE_FORMULA){ + resultText += cell.getCellFormula() + " "; + } + */ + } + } + } + } + return resultText; + } + + + public Properties extractProperties(InputStream input) throws IOException { + + PropertiesBroker propertiesBroker = new PropertiesBroker(); + POIFSReader reader = new POIFSReader(); + reader.registerListener(new PropertiesReaderListener(propertiesBroker), + "\005SummaryInformation"); + reader.read(input); + return propertiesBroker.getProperties(); + } + + + class PropertiesBroker { + + private Properties properties; + private int timeoutMillis = 2 * 1000; + + + public synchronized Properties getProperties() { + + long start = new Date().getTime(); + long now = start; + + while ((properties == null) && (now-start < timeoutMillis)) { + try { + wait(timeoutMillis / 10); + } catch (InterruptedException e) {} + now = new Date().getTime(); + } + + notifyAll(); + return properties; + } + + public synchronized void setProperties(Properties properties) { + this.properties = properties; + notifyAll(); + } + } + +} + Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java?rev=376768&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java (added) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java Fri Feb 10 09:08:23 2006 @@ -0,0 +1,125 @@ 
+/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse.msexcel; + +// JDK imports +import java.io.ByteArrayInputStream; +import java.util.Properties; +import java.util.logging.Logger; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.LogFormatter; + +// Nutch imports +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.OutlinkExtractor; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.protocol.Content; + +/** + * An Excel document parser. 
+ * + * @author Rohit Kulkarni & Ashish Vaidya + * @author Jérôme Charron + */ +public class MSExcelParser implements Parser { + + private Configuration conf; + + private static final Logger LOG = LogFormatter.getLogger(MSExcelParser.class.getName()); + + /** Creates a new instance of MSExcelParser */ + public MSExcelParser() { } + + public Parse getParse(Content content) { + + String text = null; + String title = null; + Properties properties = null; + + try { + byte[] raw = content.getContent(); + String contentLength = content.getMetadata().get(Metadata.CONTENT_LENGTH); + if ((contentLength != null) && + (raw.length != Integer.parseInt(contentLength))) { + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_TRUNCATED, + "Content truncated at " + raw.length +" bytes. " + + "Parser can't handle incomplete msexcelfile.") + .getEmptyParse(this.conf); + } + + ExcelExtractor extractor = new ExcelExtractor(); + // Extract text + text = extractor.extractText(new ByteArrayInputStream(raw)); + // Extract properties + properties = extractor.extractProperties(new ByteArrayInputStream(raw)); + + //currently returning empty outlinks array + //outlinks = this.fetchOutlinks(resultText); + + } catch (Exception e) { + return new ParseStatus(ParseStatus.FAILED, + "Can't be handled as msexcel document. 
" + e) + .getEmptyParse(this.conf); + } finally { + // nothing so far + } + + // collect meta data + Metadata metadata = new Metadata(); + title = properties.getProperty(DublinCore.TITLE); + properties.remove(DublinCore.TITLE); + metadata.setAll(properties); + + if (text == null) { text = ""; } + if (title == null) { title = ""; } + + // collect outlink + Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, this.conf); + + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, + outlinks, content.getMetadata(), + metadata); + parseData.setConf(this.conf); + return new ParseImpl(text, parseData); + } + + + /* ----------------------------- * + * <implementation:Configurable> * + * ----------------------------- */ + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return this.conf; + } + + /* ------------------------------ * + * </implementation:Configurable> * + * ------------------------------ */ + +} Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java?rev=376768&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java (added) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java Fri Feb 10 09:08:23 2006 @@ -0,0 +1,117 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may
not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse.msexcel; + +// JDK imports +import java.util.Date; +import java.util.Properties; + +// Jakarta POI imports +import org.apache.poi.hpsf.PropertySetFactory; +import org.apache.poi.hpsf.SummaryInformation; +import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent; +import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; + +// Nutch imports +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.msexcel.ExcelExtractor.PropertiesBroker; + + +/** + * @author Rohit Kulkarni & Ashish Vaidya + * @author Jérôme Charron + */ +public class PropertiesReaderListener implements POIFSReaderListener { + + private PropertiesBroker propertiesBroker; + private Properties metaData = new Properties(); + + public PropertiesReaderListener(PropertiesBroker propertiesBroker) { + this.propertiesBroker = propertiesBroker; + } + + public void processPOIFSReaderEvent(POIFSReaderEvent event) { + + SummaryInformation si = null; + Properties properties = new Properties(); + + try { + si = (SummaryInformation)PropertySetFactory.create(event.getStream()); + } catch (Exception ex) { + properties = null; + } + + Date tmp = null; + + String title = si.getTitle(); + String applicationName = si.getApplicationName(); + String author = si.getAuthor(); + int charCount = si.getCharCount(); + String comments = si.getComments(); + Date createDateTime = si.getCreateDateTime(); + long editTime = si.getEditTime(); + String keywords = si.getKeywords(); + String lastAuthor = 
si.getLastAuthor(); + Date lastPrinted = si.getLastPrinted(); + Date lastSaveDateTime = si.getLastSaveDateTime(); + int pageCount = si.getPageCount(); + String revNumber = si.getRevNumber(); + int security = si.getSecurity(); + String subject = si.getSubject(); + String template = si.getTemplate(); + int wordCount = si.getWordCount(); + + /*Dates are being stored in millis since the epoch to aid + localization*/ + if(title != null) + properties.setProperty(Metadata.TITLE, title); + if(applicationName != null) + properties.setProperty(Metadata.APPLICATION_NAME, applicationName); + if(author != null) + properties.setProperty(Metadata.AUTHOR, author); + if(charCount != 0) + properties.setProperty(Metadata.CHARACTER_COUNT, charCount + ""); + if(comments != null) + properties.setProperty(Metadata.COMMENTS, comments); + if(createDateTime != null) + properties.setProperty(Metadata.DATE, + Metadata.DATE_FORMAT.format(createDateTime)); + if(editTime != 0) + properties.setProperty(Metadata.LAST_MODIFIED, editTime + ""); + if(keywords != null) + properties.setProperty(Metadata.KEYWORDS, keywords); + if(lastAuthor != null) + properties.setProperty(Metadata.LAST_AUTHOR, lastAuthor); + if(lastPrinted != null) + properties.setProperty(Metadata.LAST_PRINTED, lastPrinted.getTime() + ""); + if(lastSaveDateTime != null) + properties.setProperty(Metadata.LAST_SAVED, lastSaveDateTime.getTime() + ""); + if(pageCount != 0) + properties.setProperty(Metadata.PAGE_COUNT, pageCount + ""); + if(revNumber != null) + properties.setProperty(Metadata.REVISION_NUMBER, revNumber); + if(security != 0) + properties.setProperty(Metadata.RIGHTS, security + ""); + if(subject != null) + properties.setProperty(Metadata.SUBJECT, subject); + if(template != null) + properties.setProperty(Metadata.TEMPLATE, template); + if(wordCount != 0) + properties.setProperty(Metadata.WORD_COUNT, wordCount + ""); + propertiesBroker.setProperties(properties); + } + +} Propchange: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html?rev=376768&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html (added) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html Fri Feb 10 09:08:23 2006 @@ -0,0 +1,6 @@ +<html> +<body> +<p>An Excel document parsing plugin.</p> +<p>This package relies on Jakarta <a href="http://jakarta.apache.org/poi/index.html">POI</a>.</p> +</body> +</html> Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java?rev=376768&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java (added) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java Fri Feb 10 09:08:23 2006 @@ -0,0 +1,64 @@ +/* + * TestMSExcelParser.java + * Based on the Unit Tests for MSWordParser by John Xing + */ +package org.apache.nutch.parse.msexcel; + +import 
org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; + +import org.apache.nutch.parse.ParserFactory; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; + +import junit.framework.TestCase; + +/** + * Based on Unit tests for MSWordParser by John Xing + * + * @author Rohit Kulkarni & Ashish Vaidya + */ +public class TestMSExcelParser extends TestCase { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data","."); + + // Make sure sample files are copied to "test.data" + + private String[] sampleFiles = {"test.xls"}; + + private String expectedText = "BitStream test.xls 321654.0 Apache incubator 1234.0 Doug Cutting 89078.0 CS 599 Search Engines Spring 2005.0 SBC 1234.0 764893.0 Java NUTCH!! 
"; + + public TestMSExcelParser(String name) { + super(name); + } + + protected void setUp() {} + + protected void tearDown() {} + + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parser parser; + Parse parse; + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + protocol = ProtocolFactory.getProtocol(urlString); + content = protocol.getContent(urlString); + + parser = ParserFactory.getParser(content.getContentType(), urlString); + parse = parser.getParse(content); + + assertTrue(parse.getText().equals(expectedText)); + } + } + +} Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java ------------------------------------------------------------------------------ svn:eol-style = native ------------------------------------------------------- This SF.net email is sponsored by: Splunk Inc. Do you grep through log files for problems? Stop! Download the new AJAX search engine that makes searching your log files as easy as surfing the web. DOWNLOAD SPLUNK! http://sel.as-us.falkag.net/sel?cmd=lnk&kid=103432&bid=230486&dat=121642 _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs