Author: ab
Date: Tue Apr 25 12:12:48 2006
New Revision: 396955

URL: http://svn.apache.org/viewcvs?rev=396955&view=rev
Log:
Parser for OpenOffice and OpenDocument formats (an updated version of
NUTCH-125).

Development of this plugin was supported by Zaheed Haque. Thank you!

Added:
    lucene/nutch/trunk/src/plugin/parse-oo/
    lucene/nutch/trunk/src/plugin/parse-oo/build.xml   (with props)
    lucene/nutch/trunk/src/plugin/parse-oo/plugin.xml   (with props)
    lucene/nutch/trunk/src/plugin/parse-oo/sample/
    lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.odt   (with props)
    lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.sxw   (with props)
    lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.txt   (with props)
    lucene/nutch/trunk/src/plugin/parse-oo/src/
    lucene/nutch/trunk/src/plugin/parse-oo/src/java/
    lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/
    lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/
    
    lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-oo/src/test/
    lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/
    lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/
    
    lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java   (with props)
Modified:
    lucene/nutch/trunk/conf/mime-types.xml
    lucene/nutch/trunk/conf/parse-plugins.xml
    lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/conf/mime-types.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/mime-types.xml?rev=396955&r1=396954&r2=396955&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/mime-types.xml (original)
+++ lucene/nutch/trunk/conf/mime-types.xml Tue Apr 25 12:12:48 2006
@@ -82,9 +82,74 @@
         <ext>ppt</ext>
     </mime-type>
 
+    <mime-type name="application/vnd.oasis.opendocument.presentation"
+               description="OpenDocument Presentation">
+        <ext>odp</ext>
+    </mime-type>
+
+    <mime-type name="application/vnd.oasis.opendocument.presentation-template"
+               description="OpenDocument Presentation Template">
+        <ext>otp</ext>
+    </mime-type>
+
+    <mime-type name="application/vnd.oasis.opendocument.spreadsheet"
+               description="OpenDocument Spreadsheet">
+        <ext>ods</ext>
+    </mime-type>
+
+    <mime-type name="application/vnd.oasis.opendocument.spreadsheet-template"
+               description="OpenDocument Spreadsheet Template">
+        <ext>ots</ext>
+    </mime-type>
+
+    <mime-type name="application/vnd.oasis.opendocument.text"
+               description="OpenDocument Text Document">
+        <ext>odt</ext>
+    </mime-type>
+
+    <mime-type name="application/vnd.oasis.opendocument.text-template"
+               description="OpenDocument Text Document Template">
+        <ext>ott</ext>
+    </mime-type>
+
+    <mime-type name="application/vnd.oasis.opendocument.text-master"
+               description="OpenDocument Text Document Master">
+        <ext>odm</ext>
+    </mime-type>
+
+    <mime-type name="application/vnd.oasis.opendocument.text-web"
+               description="OpenDocument Web Text Document">
+        <ext>odh</ext>
+    </mime-type>
+
+    <mime-type name="application/vnd.sun.xml.calc"
+               description="StarOffice Calc Spreadsheet">
+        <ext>sxc</ext>
+    </mime-type>
+
+    <mime-type name="application/vnd.sun.xml.calc.template"
+               description="StarOffice Calc Spreadsheet Template">
+        <ext>stc</ext>
+    </mime-type>
+
+    <mime-type name="application/vnd.sun.xml.impress"
+               description="StarOffice Impress Presentation">
+        <ext>sxi</ext>
+    </mime-type>
+
+    <mime-type name="application/vnd.sun.xml.impress.template"
+               description="StarOffice Impress Presentation Template">
+        <ext>sti</ext>
+    </mime-type>
+
     <mime-type name="application/vnd.sun.xml.writer"
-               description="OpenOffice.org Text Document">
+               description="StarOffice Text Document">
         <ext>sxw</ext>
+    </mime-type>
+
+    <mime-type name="application/vnd.sun.xml.writer.template"
+               description="StarOffice Text Document Template">
+        <ext>stw</ext>
     </mime-type>
 
     <mime-type name="application/vnd.wap.wbxml">

Modified: lucene/nutch/trunk/conf/parse-plugins.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=396955&r1=396954&r2=396955&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/parse-plugins.xml (original)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Apr 25 12:12:48 2006
@@ -57,6 +57,62 @@
                <plugin id="parse-mspowerpoint" />
        </mimeType>
 
+       <mimeType name="application/vnd.oasis.opendocument.text">
+               <plugin id="parse-oo" />
+       </mimeType>
+
+       <mimeType name="application/vnd.oasis.opendocument.text-template">
+               <plugin id="parse-oo" />
+       </mimeType>
+
+       <mimeType name="application/vnd.oasis.opendocument.text-master">
+               <plugin id="parse-oo" />
+       </mimeType>
+
+       <mimeType name="application/vnd.oasis.opendocument.text-web">
+               <plugin id="parse-oo" />
+       </mimeType>
+
+       <mimeType name="application/vnd.oasis.opendocument.presentation">
+               <plugin id="parse-oo" />
+       </mimeType>
+
+       <mimeType name="application/vnd.oasis.opendocument.presentation-template">
+               <plugin id="parse-oo" />
+       </mimeType>
+
+       <mimeType name="application/vnd.oasis.opendocument.spreadsheet">
+               <plugin id="parse-oo" />
+       </mimeType>
+
+       <mimeType name="application/vnd.oasis.opendocument.spreadsheet-template">
+               <plugin id="parse-oo" />
+       </mimeType>
+
+       <mimeType name="application/vnd.sun.xml.calc">
+               <plugin id="parse-oo" />
+       </mimeType>
+
+       <mimeType name="application/vnd.sun.xml.calc.template">
+               <plugin id="parse-oo" />
+       </mimeType>
+
+       <mimeType name="application/vnd.sun.xml.impress">
+               <plugin id="parse-oo" />
+       </mimeType>
+
+       <mimeType name="application/vnd.sun.xml.impress.template">
+               <plugin id="parse-oo" />
+       </mimeType>
+
+       <mimeType name="application/vnd.sun.xml.writer">
+               <plugin id="parse-oo" />
+       </mimeType>
+
+       <mimeType name="application/vnd.sun.xml.writer.template">
+               <plugin id="parse-oo" />
+       </mimeType>
+
        <mimeType name="application/vnd.wap.wbxml">
                <plugin id="parse-text" />
        </mimeType>

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=396955&r1=396954&r2=396955&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Apr 25 12:12:48 2006
@@ -39,6 +39,7 @@
      <ant dir="parse-msexcel" target="deploy"/>
      <ant dir="parse-mspowerpoint" target="deploy"/>
      <ant dir="parse-msword" target="deploy"/>
+     <ant dir="parse-oo" target="deploy"/>
      <ant dir="parse-pdf" target="deploy"/>
      <ant dir="parse-rss" target="deploy"/>
      <!-- <ant dir="parse-rtf" target="deploy"/> -->
@@ -69,6 +70,7 @@
      <ant dir="parse-msexcel" target="test"/>
      <ant dir="parse-mspowerpoint" target="test"/>
      <ant dir="parse-msword" target="test"/>
+     <ant dir="parse-oo" target="test"/>
      <ant dir="parse-pdf" target="test"/>
      <ant dir="parse-rss" target="test"/>
  <!-- <ant dir="parse-rtf" target="test"/> -->
@@ -113,6 +115,7 @@
     <ant dir="parse-msexcel" target="clean"/>
     <ant dir="parse-mspowerpoint" target="clean"/>
     <ant dir="parse-msword" target="clean"/>
+    <ant dir="parse-oo" target="clean"/>
     <ant dir="parse-pdf" target="clean"/>
     <ant dir="parse-rss" target="clean"/>
     <ant dir="parse-rtf" target="clean"/>

Added: lucene/nutch/trunk/src/plugin/parse-oo/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-oo/build.xml?rev=396955&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-oo/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-oo/build.xml Tue Apr 25 12:12:48 2006
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+
+<project name="parse-oo" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-xml"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-xml/*.jar" />
+    </fileset>
+  </path>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-xml"/>
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

Propchange: lucene/nutch/trunk/src/plugin/parse-oo/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-oo/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-oo/plugin.xml?rev=396955&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-oo/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-oo/plugin.xml Tue Apr 25 12:12:48 2006
@@ -0,0 +1,91 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-oo"
+   name="OpenOffice/OpenDocument Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="parse-oo.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-xml"/>
+   </requires>
+
+   <extension id="org.apache.nutch.parse.oo"
+              name="OOParse"
+              point="org.apache.nutch.parse.Parser">
+
+      <!-- Text / Writer -->
+      <implementation id="org.apache.nutch.parse.oo.OpenDocument.Text"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.oasis.opendocument.text"
+                      pathSuffix="odt"/>
+      <implementation id="org.apache.nutch.parse.oo.StarOffice.Writer"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.sun.xml.writer"
+                      pathSuffix="sxw"/>
+      <!-- Text / Writer Template -->
+      <implementation id="org.apache.nutch.parse.oo.OpenDocument.TextTemplate"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.oasis.opendocument.text-template"
+                      pathSuffix="ott"/>
+      <implementation id="org.apache.nutch.parse.oo.StarOffice.WriterTemplate"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.sun.xml.writer.template"
+                      pathSuffix="stw"/>
+      <!-- Text-Web -->
+      <implementation id="org.apache.nutch.parse.oo.OpenDocument.TextWeb"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.oasis.opendocument.text-web"
+                      pathSuffix="odh"/>
+      <!-- Text-Master -->
+      <implementation id="org.apache.nutch.parse.oo.OpenDocument.TextMaster"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.oasis.opendocument.text-master"
+                      pathSuffix="odm"/>
+      <!-- Spreadsheet / Calc -->
+      <implementation id="org.apache.nutch.parse.oo.OpenDocument.Spreadsheet"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.oasis.opendocument.spreadsheet"
+                      pathSuffix="ods"/>
+      <implementation id="org.apache.nutch.parse.oo.StarOffice.Calc"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.sun.xml.calc"
+                      pathSuffix="sxc"/>
+      <!-- Spreadsheet / Calc Template -->
+      <implementation id="org.apache.nutch.parse.oo.OpenDocument.SpreadsheetTemplate"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.oasis.opendocument.spreadsheet-template"
+                      pathSuffix="ots"/>
+      <implementation id="org.apache.nutch.parse.oo.StarOffice.CalcTemplate"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.sun.xml.calc.template"
+                      pathSuffix="stc"/>
+      <!-- Presentation / Impress -->
+      <implementation id="org.apache.nutch.parse.oo.OpenDocument.Presentation"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.oasis.opendocument.presentation"
+                      pathSuffix="odp"/>
+      <implementation id="org.apache.nutch.parse.oo.StarOffice.Impress"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.sun.xml.impress"
+                      pathSuffix="sxi"/>
+      <!-- Presentation / Impress Template -->
+      <implementation id="org.apache.nutch.parse.oo.OpenDocument.PresentationTemplate"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.oasis.opendocument.presentation-template"
+                      pathSuffix="otp"/>
+      <implementation id="org.apache.nutch.parse.oo.StarOffice.ImpressTemplate"
+                      class="org.apache.nutch.parse.oo.OOParser"
+                     contentType="application/vnd.sun.xml.impress.template"
+                      pathSuffix="sti"/>
+
+   </extension>
+
+</plugin>

Propchange: lucene/nutch/trunk/src/plugin/parse-oo/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.odt
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.odt?rev=396955&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.odt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.sxw
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.sxw?rev=396955&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.sxw
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.txt
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.txt?rev=396955&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.txt (added)
+++ lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.txt Tue Apr 25 12:12:48 2006
@@ -0,0 +1,25 @@
+Abcedfg ????? Lorem ipsum dolor sit amet, consectetuer adipiscing elit. 
Integer a leo in lacus malesuada ornare. Mauris sagittis. Nam vestibulum. Nunc 
gravida  vestibulum augue. Praesent sed lectus quis lectus adipiscing bibendum. 
 Sed nulla.  Duis posuere  justo eget urna.  Proin lorem orci, vestibulum ut, 
consequat molestie, eleifend a, nibh. Mauris sed lacus.  Etiam blandit 
tincidunt neque. Cras ac sapien. Duis erat.  http://www.openoffice.org Lorem 
ipsum dolor sit amet, consectetuer adipiscing elit. Integer a leo in lacus 
malesuada ornare.  Mauris sagittis.  Nam vestibulum. Nunc gravida  vestibulum 
augue. Praesent sed lectus quis lectus adipiscing bibendum.  Sed nulla.  Duis 
posuere  justo eget urna.  Proin lorem orci , vestibulum ut, consequat 
molestie, eleifend a, nibh. Mauris sed lacus.  Etiam blandit tincidunt neque. 
Cras ac sapien. Duis erat.  Vivamus fringilla velit sit amet leo imperdiet 
sagittis. Maecenas porttitor. Nulla eget nibh. Nulla accumsan. Donec nec mi. 
 Etiam dui justo, volutpat a, volutpat id, ornare sed, erat. Vivamus urna eros, 
nonummy id, vestibulum ut, elementum in, diam. Etiam eros nibh, porta vel, 
dignissim non, interdum eu, lorem.  Curabitur lorem. Nulla pellentesque congue 
nisl.  Duis id mauris et mauris varius hendrerit. Nunc sem. Nam id sem ut 
tortor sollicitudin bibendum. Mauris placerat, pede ut vestibulum venenatis, 
eros metus semper lacus, a fermentum felis massa sit amet lectus. Cras tellus 
eros, molestie sit amet, egestas varius, porttitor eget, odio.  Nunc vitae 
augue luctus eros fringilla convallis. Proin tincidunt neque ut dui. Morbi at 
nisl. Nunc neque. Cras a sem et ligula semper eleifend.  
ĄćęłńóśźżĄĆĘŁŃÓŚŹŻ ÄöåÄÖä \’”<>& &amp;
+Abcdefg
+Abcdefg
+abcdefg
+Akuku
+Akuku
+Akuku
+akuku
+Akuku
+Akuku
+Akuku
+akuku
+Title
+Col1
+Col2
+Col3
+head
+Cell1
+Cell2
+Cel3
+total
+TOTAL
+TOTAL
+TOTAL
+                                http://www.openoffice.org
\ No newline at end of file

Propchange: lucene/nutch/trunk/src/plugin/parse-oo/sample/ootest.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java?rev=396955&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java Tue Apr 25 12:12:48 2006
@@ -0,0 +1,218 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.oo;
+
+import java.io.*;
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+import java.util.zip.*;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.jaxen.*;
+import org.jaxen.jdom.JDOMXPath;
+import org.jdom.*;
+import org.jdom.input.*;
+
+/**
+ * Parser for OpenOffice and OpenDocument formats. This should handle
+ * the following formats: Text, Spreadsheet, Presentation, and
+ * corresponding templates and "master" documents.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class OOParser implements Parser {
+  public static final Logger LOG =
+    LogFormatter.getLogger(OOParser.class.getName());
+  
+  private Configuration conf;
+
+  public OOParser () {
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+  
+  public Configuration getConf() {
+    return conf;
+  }
+  
+  public Parse getParse(Content content) {
+    String text = null;
+    String title = null;
+    Metadata metadata = new Metadata();
+    ArrayList outlinks = new ArrayList();
+
+    try {
+      byte[] raw = content.getContent();
+      String contentLength = content.getMetadata().get("Content-Length");
+      if (contentLength != null
+            && raw.length != Integer.parseInt(contentLength)) {
+          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+                  "Content truncated at "+raw.length
+            +" bytes. Parser can't handle incomplete files.").getEmptyParse(conf);
+      }
+      ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(raw));
+      ZipEntry ze = null;
+      while ((ze = zis.getNextEntry()) != null) {
+        if (ze.getName().equals("content.xml")) {
+          text = parseContent(ze, zis, outlinks);
+        } else if (ze.getName().equals("meta.xml")) {
+          parseMeta(ze, zis, metadata);
+        }
+      }
+      zis.close();
+    } catch (Exception e) { // run time exception
+      e.printStackTrace();
+      return new ParseStatus(ParseStatus.FAILED,
+              "Can't be handled as OO document. " + e).getEmptyParse(conf);
+    }
+
+    title = metadata.get(Metadata.TITLE);
+    if (text == null)
+      text = "";
+
+    if (title == null)
+      title = "";
+
+    Outlink[] links = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, links, metadata);
+    return new ParseImpl(text, parseData);
+  }
+  
+  // extract as much plain text as possible.
+  private String parseContent(ZipEntry ze, ZipInputStream zis, ArrayList outlinks) throws Exception {
+    StringBuffer res = new StringBuffer();
+    FilterInputStream fis = new FilterInputStream(zis) {
+      public void close() {};
+    };
+    SAXBuilder builder = new SAXBuilder();
+    Document doc = builder.build(fis);
+    Element root = doc.getRootElement();
+    // XXX this is expensive for very large documents. In those cases another
+    // XXX method (direct processing of SAX events, or XMLPull) should be used.
+    XPath path = new JDOMXPath("//text:span | //text:p | //text:tab | //text:tab-stop | //text:a");
+    path.addNamespace("text", root.getNamespace("text").getURI());
+    Namespace xlink = Namespace.getNamespace("xlink", "http://www.w3.org/1999/xlink");
+    List list = path.selectNodes(doc);
+    boolean lastp = true;
+    for (int i = 0; i < list.size(); i++) {
+      Element el = (Element)list.get(i);
+      String text = el.getText();
+      if (el.getName().equals("p")) {
+        // skip empty paragraphs
+        if (!text.equals("")) {
+          if (!lastp) res.append("\n");
+          res.append(text + "\n");
+          lastp = true;
+        }
+      } else if (el.getName().startsWith("tab")) {
+        res.append("\t");
+        lastp = false;
+      } else if (el.getName().equals("a")) {
+        List nl = el.getChildren();
+        String a = null;
+        for (int k = 0; k < nl.size(); k++) {
+          Element anchor = (Element)nl.get(k);
+          String nsName = anchor.getNamespacePrefix() + ":" + anchor.getName();
+          if (!nsName.equals("text:span")) continue;
+          a = anchor.getText();
+          break;
+        }
+        String u = el.getAttributeValue("href", xlink);
+        if (u == null) u = a; // often anchors are URLs
+        try {
+          Outlink o = new Outlink(u, a, conf);
+          outlinks.add(o);
+        } catch (MalformedURLException mue) {
+          // skip
+        }
+        if (a != null && !a.equals("")) {
+          if (!lastp) res.append(' ');
+          res.append(a);
+          lastp = false;
+        }
+      } else {
+        if (!text.equals("")) {
+          if (!lastp) res.append(' ');
+          res.append(text);
+        }
+        lastp = false;
+      }
+    }
+    return res.toString();
+  }
+  
+  // extract metadata and convert them to Nutch format
+  private void parseMeta(ZipEntry ze, ZipInputStream zis, Metadata metadata) throws Exception {
+    FilterInputStream fis = new FilterInputStream(zis) {
+      public void close() {};
+    };
+    SAXBuilder builder = new SAXBuilder();
+    Document doc = builder.build(fis);
+    XPath path = new JDOMXPath("/office:document-meta/office:meta/*");
+    Element root = doc.getRootElement();
+    path.addNamespace("office", root.getNamespace("office").getURI());
+    List list = path.selectNodes(doc);
+    for (int i = 0; i < list.size(); i++) {
+      Element n = (Element)list.get(i);
+      String text = n.getText();
+      if (text.trim().equals("")) continue;
+      String name = n.getName();
+      if (name.equals("title"))
+        metadata.add(Metadata.TITLE, text);
+      else if (name.equals("language"))
+        metadata.add(Metadata.LANGUAGE, text);
+      else if (name.equals("creation-date"))
+        metadata.add(Metadata.DATE, text);
+      else if (name.equals("print-date"))
+        metadata.add(Metadata.LAST_PRINTED, text);
+      else if (name.equals("generator"))
+        metadata.add(Metadata.APPLICATION_NAME, text);
+      else if (name.equals("creator"))
+        metadata.add(Metadata.CREATOR, text);
+    }
+  }
+  
+  public static void main(String[] args) throws Exception {
+    OOParser oo = new OOParser();
+    Configuration conf = NutchConfiguration.create();
+    oo.setConf(conf);
+    FileInputStream fis = new FileInputStream(args[0]);
+    byte[] bytes = new byte[fis.available()];
+    fis.read(bytes);
+    fis.close();
+    Content c = new Content("local", "local", bytes, "application/vnd.oasis.opendocument.text", new Metadata(), conf);
+    Parse p = oo.getParse(c);
+    System.out.println(p.getData());
+    System.out.println("Text: '" + p.getText() + "'");
+    /*
+    // create the test output file
+    OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream("e:\\ootest.txt"), "UTF-8");
+    osw.write(p.getText());
+    osw.flush();
+    osw.close();
+    */
+  }
+}
\ No newline at end of file

Propchange: 
lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java?rev=396955&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java Tue Apr 25 12:12:48 2006
@@ -0,0 +1,99 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.oo;
+
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.protocol.*;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/** 
+ * Unit tests for OOParser.
+ *
+ * @author Andrzej Bialecki
+ */
+public class TestOOParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data",".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-oo/build.xml during plugin compilation.
+  private String[] sampleFiles = {"ootest.odt", "ootest.sxw"};
+
+  private String sampleText = "ootest.txt";
+  
+  private String expectedText;
+
+  public TestOOParser(String name) { 
+    super(name);
+    try {
+      // read the test string
+      FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + sampleText);
+      StringBuffer sb = new StringBuffer();
+      int len = 0;
+      InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
+      char[] buf = new char[1024];
+      while ((len = isr.read(buf)) > 0) {
+        sb.append(buf, 0, len);
+      }
+      isr.close();
+      expectedText = sb.toString();
+      // normalize space
+      expectedText = expectedText.replaceAll("[ \t\r\n]+", " ");
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+  protected void setUp() {}
+
+  protected void tearDown() {}
+
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Content content;
+    Parse parse;
+    Configuration conf = NutchConfiguration.create();
+    Protocol protocol;
+    ProtocolFactory factory = new ProtocolFactory(conf);
+    OOParser parser = new OOParser();
+    parser.setConf(conf);
+
+    for (int i=0; i<sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = factory.getProtocol(urlString);
+      content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+
+      parse = parser.getParse(content);
+
+      String text = parse.getText().replaceAll("[ \t\r\n]+", " ");
+      assertTrue(expectedText.equals(text));
+    }
+  }
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to