Author: jerome
Date: Fri Feb 10 09:08:23 2006
New Revision: 376768

URL: http://svn.apache.org/viewcvs?rev=376768&view=rev
Log:
NUTCH-52, Add a parser plugin for MS Excel files

Added:
    lucene/nutch/trunk/src/plugin/parse-msexcel/
    lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml   (with props)
    lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml   (with props)
    lucene/nutch/trunk/src/plugin/parse-msexcel/sample/
    lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls   (with props)
    lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
   (with props)
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
   (with props)
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java
   (with props)
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
   (with props)
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
   (with props)
Modified:
    lucene/nutch/trunk/build.xml
    lucene/nutch/trunk/default.properties
    lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376768&r1=376767&r2=376768&view=diff
==============================================================================
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Fri Feb 10 09:08:23 2006
@@ -254,6 +254,7 @@
        <packageset dir="${plugins.dir}/parse-pdf/src/java"/>
 <!--   <packageset dir="${plugins.dir}/parse-rtf/src/java"/> plugin excluded 
from build due to licensing issues-->
 <!--   <packageset dir="${plugins.dir}/parse-mp3/src/java"/> plugin excluded 
from build due to licensing issues-->
+       <packageset dir="${plugins.dir}/parse-msexcel/src/java"/>
        <packageset dir="${plugins.dir}/parse-mspowerpoint/src/java"/>
        <packageset dir="${plugins.dir}/parse-msword/src/java"/>
        <packageset dir="${plugins.dir}/parse-rss/src/java"/>

Modified: lucene/nutch/trunk/default.properties
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=376768&r1=376767&r2=376768&view=diff
==============================================================================
--- lucene/nutch/trunk/default.properties (original)
+++ lucene/nutch/trunk/default.properties Fri Feb 10 09:08:23 2006
@@ -63,6 +63,7 @@
 plugin.libhttp=org.apache.nutch.protocol.http.api*
 plugin.more=org.apache.nutch.indexer.more*:org.apache.nutch.searcher.more*
 plugin.mp3=org.apache.nutch.parse.mp3*
+plugin.msexcel=org.apache.nutch.parse.msexcel*
 plugin.mspowerpoint=org.apache.nutch.parse.mspowerpoint*
 plugin.msword=org.apache.nutch.parse.msword*
 # Unfortunately, ontology on core and plugin uses the same package:
@@ -91,6 +92,7 @@
    ${plugin.libhttp}:\
    ${plugin.more}:\
    ${plugin.mp3}:\
+   ${plugin.msexcel}:\
    ${plugin.mspowerpoint}:\
    ${plugin.msword}:\
    ${plugin.pdf}:\

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=376768&r1=376767&r2=376768&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Fri Feb 10 09:08:23 2006
@@ -24,6 +24,7 @@
      <ant dir="parse-html" target="deploy"/>
      <ant dir="parse-js" target="deploy"/>
      <!-- <ant dir="parse-mp3" target="deploy"/> -->
+     <ant dir="parse-msexcel" target="deploy"/>
      <ant dir="parse-mspowerpoint" target="deploy"/>
      <ant dir="parse-msword" target="deploy"/>
      <ant dir="parse-pdf" target="deploy"/>
@@ -52,6 +53,7 @@
      <ant dir="parse-ext" target="test"/>
      <ant dir="parse-html" target="test"/>
      <!-- <ant dir="parse-mp3" target="test"/> -->
+     <ant dir="parse-msexcel" target="test"/>
      <ant dir="parse-mspowerpoint" target="test"/>
      <ant dir="parse-msword" target="test"/>
      <ant dir="parse-pdf" target="test"/>
@@ -86,6 +88,7 @@
     <ant dir="parse-html" target="clean"/>
     <ant dir="parse-js" target="clean"/>
     <ant dir="parse-mp3" target="clean"/>
+    <ant dir="parse-msexcel" target="clean"/>
     <ant dir="parse-mspowerpoint" target="clean"/>
     <ant dir="parse-msword" target="clean"/>
     <ant dir="parse-pdf" target="clean"/>

Added: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml Fri Feb 10 09:08:23 
2006
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+
+<project name="parse-msexcel" default="jar">
+
+       <import file="../build-plugin.xml" />
+
+  <path id="plugin.deps"> <!-- every jar found in ../lib-jakarta-poi/lib -->
+    <fileset dir="../lib-jakarta-poi/lib">
+      <include name="*.jar" />
+    </fileset>
+  </path>
+
+       <!-- for junit test: copy the sample *.xls files into ${build.test}/data -->
+       <mkdir dir="${build.test}/data" />
+       <copy todir="${build.test}/data">
+               <fileset dir="sample">
+                       <include name="*.xls" />
+               </fileset>
+       </copy>
+</project>

Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml Fri Feb 10 09:08:23 
2006
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-msexcel"
+   name="MSExcel Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime> <!-- exports everything contained in parse-msexcel.jar -->
+      <library name="parse-msexcel.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires> <!-- imports the nutch-extensionpoints and lib-jakarta-poi plugins -->
+     <import plugin="nutch-extensionpoints"/>
+     <import plugin="lib-jakarta-poi"/>
+   </requires>
+
+   <extension id="org.apache.nutch.parse.msexcel"
+              name="MSExcelParser" 
+              point="org.apache.nutch.parse.Parser">
+
+      <implementation id="org.apache.nutch.parse.msexcel.MSExcelParser"
+                      class="org.apache.nutch.parse.msexcel.MSExcelParser" 
+                      contentType="application/vnd.ms-excel"
+                      pathSuffix="xls"/>
+   </extension> <!-- MSExcelParser is declared for application/vnd.ms-excel content and the xls suffix -->
+
+</plugin>

Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls?rev=376768&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content?rev=376768&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content (added)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/sample/test.xls.content Fri Feb 
10 09:08:23 2006
@@ -0,0 +1,3 @@
+BitStream test.xls 321654.0 Apache incubator 1234.0 Doug Cutting 89078 CS 599 
Search Engines Spring 2005.0 SBC 1234.0 764893.0 Java NUTCH!! 
+
+BitStream test.xls 321654.0 Apache incubator 1234.0 Doug Cutting 89078.0 CS 
599 Search Engines Spring 2005.0 SBC 1234.0 764893.0 Java NUTCH!!
\ No newline at end of file

Added: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java?rev=376768&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
 Fri Feb 10 09:08:23 2006
@@ -0,0 +1,132 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.msexcel;
+
+// JDK imports
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.Properties;
+
+// Jakarta POI imports
+import org.apache.poi.hssf.usermodel.HSSFCell;
+import org.apache.poi.hssf.usermodel.HSSFRow;
+import org.apache.poi.hssf.usermodel.HSSFSheet;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+
+/**
+ * Extracts the text content and the summary properties of an Excel
+ * document with Jakarta POI.
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class ExcelExtractor {
+
+  /**
+   * Returns the text of every string and numeric cell of the workbook read
+   * from <code>input</code>. Sheets, rows and cells are visited in index
+   * order and each extracted value is followed by a single space. Numeric
+   * values are rendered with <code>Double.toString</code>. Cells of any
+   * other type are skipped; date-formatted and formula cells get no
+   * special handling.
+   *
+   * @param input stream to read the Excel document from
+   * @return the concatenated cell values, or an empty string if no string
+   *         or numeric cell was found
+   * @throws IOException if the workbook cannot be read from the stream
+   */
+  public String extractText(InputStream input) throws IOException {
+
+    HSSFWorkbook wb = new HSSFWorkbook(input);
+    // Accumulate in a buffer: repeated String concatenation copies the
+    // whole text for every cell, which is quadratic on large sheets.
+    StringBuffer text = new StringBuffer();
+
+    int sheetCount = wb.getNumberOfSheets();
+    for (int i = 0; i < sheetCount; i++) {
+      HSSFSheet sheet = wb.getSheetAt(i);
+      if (sheet == null) {
+        continue;
+      }
+      // The last row number is treated as an inclusive index, hence "<=".
+      int lastRow = sheet.getLastRowNum();
+      for (int j = 0; j <= lastRow; j++) {
+        HSSFRow row = sheet.getRow(j);
+        if (row == null) {
+          continue;
+        }
+        appendRow(row, text);
+      }
+    }
+    return text.toString();
+  }
+
+  /**
+   * Appends the value of each string or numeric cell of <code>row</code>,
+   * followed by a space, to <code>text</code>.
+   */
+  private void appendRow(HSSFRow row, StringBuffer text) {
+    // The last cell number is treated as an exclusive bound, hence "<".
+    int cellCount = row.getLastCellNum();
+    for (int k = 0; k < cellCount; k++) {
+      HSSFCell cell = row.getCell((short) k);
+      if (cell == null) {
+        continue;
+      }
+      int type = cell.getCellType();
+      if (type == HSSFCell.CELL_TYPE_STRING) {
+        text.append(cell.getStringCellValue()).append(' ');
+      } else if (type == HSSFCell.CELL_TYPE_NUMERIC) {
+        text.append(Double.toString(cell.getNumericCellValue()));
+        text.append(' ');
+      }
+    }
+  }
+
+  /**
+   * Reads <code>input</code> with a {@link POIFSReader} that has a
+   * {@link PropertiesReaderListener} registered for the
+   * <code>\005SummaryInformation</code> entry, and returns what the
+   * listener handed to the broker.
+   *
+   * @param input stream to read the Excel document from
+   * @return the collected properties, or <code>null</code> if none were
+   *         published before the broker's timeout expired
+   * @throws IOException if the stream cannot be read
+   */
+  public Properties extractProperties(InputStream input) throws IOException {
+
+    PropertiesBroker propertiesBroker = new PropertiesBroker();
+    POIFSReader reader = new POIFSReader();
+    reader.registerListener(new PropertiesReaderListener(propertiesBroker),
+                            "\005SummaryInformation");
+    reader.read(input);
+    return propertiesBroker.getProperties();
+  }
+
+
+  /**
+   * Hands the properties passed to {@link #setProperties(Properties)}
+   * over to the caller of {@link #getProperties()}.
+   */
+  class PropertiesBroker {
+
+    private Properties properties;
+    private final int timeoutMillis = 2 * 1000;
+
+
+    /**
+     * Waits until properties have been set or the timeout (2 seconds)
+     * has elapsed, whichever comes first.
+     *
+     * @return the properties, or <code>null</code> if the timeout elapsed
+     *         before any were set
+     */
+    public synchronized Properties getProperties() {
+
+      long start = System.currentTimeMillis();
+      long now = start;
+      boolean interrupted = false;
+
+      while ((properties == null) && (now - start < timeoutMillis)) {
+        try {
+          wait(timeoutMillis / 10);
+        } catch (InterruptedException e) {
+          // Keep waiting for the full timeout, but remember the
+          // interruption so that it is not lost for the caller.
+          interrupted = true;
+        }
+        now = System.currentTimeMillis();
+      }
+      if (interrupted) {
+        Thread.currentThread().interrupt();
+      }
+
+      notifyAll();
+      return properties;
+    }
+
+    /**
+     * Stores the properties and wakes up any thread waiting in
+     * {@link #getProperties()}.
+     */
+    public synchronized void setProperties(Properties properties) {
+      this.properties = properties;
+      notifyAll();
+    }
+  }
+
+}
+
+

Propchange: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java?rev=376768&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
 Fri Feb 10 09:08:23 2006
@@ -0,0 +1,125 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.msexcel;
+
+// JDK imports
+import java.io.ByteArrayInputStream;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * An Excel document parser.
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class MSExcelParser implements Parser {
+
+  private Configuration conf;
+
+  private static final Logger LOG =
+    LogFormatter.getLogger(MSExcelParser.class.getName());
+
+  /** Creates a new instance of MSExcelParser */
+  public MSExcelParser() { }
+
+  /**
+   * Parses the raw bytes of <code>content</code> as an Excel document.
+   * <p>
+   * A failed, empty parse is returned when the content-length metadata
+   * is present and differs from the number of raw bytes (truncated
+   * content), or when text or property extraction throws. Otherwise the
+   * parse carries the extracted text, the outlinks found in that text,
+   * and the extracted properties as parse metadata; the
+   * <code>DublinCore.TITLE</code> property is removed from the metadata
+   * and used as the title.
+   *
+   * @param content the fetched content to parse
+   * @return the resulting parse, never <code>null</code>
+   */
+  public Parse getParse(Content content) {
+
+    String text = null;
+    String title = null;
+    Properties properties = null;
+
+    try {
+      byte[] raw = content.getContent();
+      String contentLength =
+        content.getMetadata().get(Metadata.CONTENT_LENGTH);
+      if ((contentLength != null) &&
+          (raw.length != Integer.parseInt(contentLength))) {
+        return new ParseStatus(ParseStatus.FAILED,
+                               ParseStatus.FAILED_TRUNCATED,
+                               "Content truncated at " + raw.length +
+                               " bytes. " +
+                               "Parser can't handle incomplete msexcelfile.")
+                               .getEmptyParse(this.conf);
+      }
+
+      // Each extraction consumes its stream, so both get a fresh one.
+      ExcelExtractor extractor = new ExcelExtractor();
+      text = extractor.extractText(new ByteArrayInputStream(raw));
+      properties = extractor.extractProperties(new ByteArrayInputStream(raw));
+
+    } catch (Exception e) {
+      return new ParseStatus(ParseStatus.FAILED,
+                             "Can't be handled as msexcel document. " + e)
+                             .getEmptyParse(this.conf);
+    }
+
+    // collect meta data
+    Metadata metadata = new Metadata();
+    // extractProperties() yields null when no properties were published
+    // in time; treat that as "no metadata" instead of failing with a
+    // NullPointerException outside of the try block above.
+    if (properties != null) {
+      title = properties.getProperty(DublinCore.TITLE);
+      properties.remove(DublinCore.TITLE);
+      metadata.setAll(properties);
+    }
+
+    if (text == null) { text = ""; }
+    if (title == null) { title = ""; }
+
+    // collect outlink
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, this.conf);
+
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+                                        outlinks, content.getMetadata(),
+                                        metadata);
+    parseData.setConf(this.conf);
+    return new ParseImpl(text, parseData);
+  }
+
+
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java?rev=376768&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java
 Fri Feb 10 09:08:23 2006
@@ -0,0 +1,117 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.msexcel;
+
+// JDK imports
+import java.util.Date;
+import java.util.Properties;
+
+// Jakarta POI imports
+import org.apache.poi.hpsf.PropertySetFactory;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.msexcel.ExcelExtractor.PropertiesBroker;
+
+
+/**
+ * POIFS reader listener that copies the fields of a document's
+ * {@link SummaryInformation} into a {@link Properties} object and hands
+ * that object to a {@link PropertiesBroker}.
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class PropertiesReaderListener implements POIFSReaderListener {
+
+    private PropertiesBroker propertiesBroker;
+
+    /**
+     * @param propertiesBroker receives the properties built by
+     *        {@link #processPOIFSReaderEvent(POIFSReaderEvent)}
+     */
+    public PropertiesReaderListener(PropertiesBroker propertiesBroker) {
+      this.propertiesBroker = propertiesBroker;
+    }
+
+    /**
+     * Builds the properties from the event's stream and publishes them to
+     * the broker. String and date fields are copied when they are not
+     * <code>null</code>, numeric fields when they are not zero. If the
+     * stream cannot be turned into a {@link SummaryInformation}, an empty
+     * <code>Properties</code> object is published.
+     *
+     * @param event the reader event carrying the summary information
+     *        stream
+     */
+    public void processPOIFSReaderEvent(POIFSReaderEvent event) {
+
+      Properties properties = new Properties();
+      SummaryInformation si;
+
+      try {
+        si = (SummaryInformation)
+          PropertySetFactory.create(event.getStream());
+      } catch (Exception ex) {
+        // No usable summary information: publish an empty result rather
+        // than dereferencing a null SummaryInformation below.
+        propertiesBroker.setProperties(properties);
+        return;
+      }
+
+      setIfNotNull(properties, Metadata.TITLE, si.getTitle());
+      setIfNotNull(properties, Metadata.APPLICATION_NAME,
+                   si.getApplicationName());
+      setIfNotNull(properties, Metadata.AUTHOR, si.getAuthor());
+      setIfNonZero(properties, Metadata.CHARACTER_COUNT, si.getCharCount());
+      setIfNotNull(properties, Metadata.COMMENTS, si.getComments());
+
+      // The creation date is stored formatted with Metadata.DATE_FORMAT.
+      // NOTE(review): if DATE_FORMAT is a shared SimpleDateFormat it is
+      // not thread-safe - confirm against Metadata.
+      Date createDateTime = si.getCreateDateTime();
+      if (createDateTime != null) {
+        properties.setProperty(Metadata.DATE,
+                               Metadata.DATE_FORMAT.format(createDateTime));
+      }
+
+      setIfNonZero(properties, Metadata.LAST_MODIFIED, si.getEditTime());
+      setIfNotNull(properties, Metadata.KEYWORDS, si.getKeywords());
+      setIfNotNull(properties, Metadata.LAST_AUTHOR, si.getLastAuthor());
+
+      // The last-printed and last-saved dates are stored as milliseconds
+      // since the epoch.
+      Date lastPrinted = si.getLastPrinted();
+      if (lastPrinted != null) {
+        properties.setProperty(Metadata.LAST_PRINTED,
+                               lastPrinted.getTime() + "");
+      }
+      Date lastSaveDateTime = si.getLastSaveDateTime();
+      if (lastSaveDateTime != null) {
+        properties.setProperty(Metadata.LAST_SAVED,
+                               lastSaveDateTime.getTime() + "");
+      }
+
+      setIfNonZero(properties, Metadata.PAGE_COUNT, si.getPageCount());
+      setIfNotNull(properties, Metadata.REVISION_NUMBER, si.getRevNumber());
+      setIfNonZero(properties, Metadata.RIGHTS, si.getSecurity());
+      setIfNotNull(properties, Metadata.SUBJECT, si.getSubject());
+      setIfNotNull(properties, Metadata.TEMPLATE, si.getTemplate());
+      setIfNonZero(properties, Metadata.WORD_COUNT, si.getWordCount());
+
+      propertiesBroker.setProperties(properties);
+    }
+
+    /** Sets <code>key</code> to <code>value</code> unless it is null. */
+    private static void setIfNotNull(Properties properties, String key,
+                                     String value) {
+      if (value != null) {
+        properties.setProperty(key, value);
+      }
+    }
+
+    /** Sets <code>key</code> to the decimal text of a non-zero value. */
+    private static void setIfNonZero(Properties properties, String key,
+                                     long value) {
+      if (value != 0) {
+        properties.setProperty(key, value + "");
+      }
+    }
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html?rev=376768&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
 (added)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
 Fri Feb 10 09:08:23 2006
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>An Excel document parsing plugin.</p>
+<p>This package relies on Jakarta <a href="http://jakarta.apache.org/poi/index.html">POI</a>.</p>
+</body>
+</html>

Propchange: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java?rev=376768&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
 Fri Feb 10 09:08:23 2006
@@ -0,0 +1,64 @@
+/*
+ *  TestMSExcelParser.java 
+ *  Based on the Unit Tests for MSWordParser by John Xing
+ */
+package org.apache.nutch.parse.msexcel;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit test for MSExcelParser: parses each sample file and compares the
+ * extracted text with the expected text.
+ * Based on Unit tests for MSWordParser by John Xing
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class TestMSExcelParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data",".");
+
+  // Make sure sample files are copied to "test.data"
+
+  private String[] sampleFiles = {"test.xls"};
+
+  // Text expected from test.xls; every cell value, including the last
+  // one, is followed by a single space.
+  private String expectedText =
+    "BitStream test.xls 321654.0 Apache incubator 1234.0 Doug Cutting "
+    + "89078.0 CS 599 Search Engines Spring 2005.0 SBC 1234.0 764893.0 "
+    + "Java NUTCH!! ";
+
+  public TestMSExcelParser(String name) {
+    super(name);
+  }
+
+  /**
+   * Fetches each sample file through its protocol plugin, parses it with
+   * the parser selected for its content type, and checks the text.
+   */
+  public void testIt() throws ProtocolException, ParseException {
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      Protocol protocol = ProtocolFactory.getProtocol(urlString);
+      Content content = protocol.getContent(urlString);
+
+      Parser parser =
+        ParserFactory.getParser(content.getContentType(), urlString);
+      Parse parse = parser.getParse(content);
+
+      // assertEquals reports both strings when they differ, unlike
+      // assertTrue(a.equals(b)).
+      assertEquals(expectedText, parse.getText());
+    }
+  }
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
------------------------------------------------------------------------------
    svn:eol-style = native




-------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc. Do you grep through log files
for problems?  Stop!  Download the new AJAX search engine that makes
searching your log files as easy as surfing the  web.  DOWNLOAD SPLUNK!
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=103432&bid=230486&dat=121642
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to