s...

jerome Mon, 13 Feb 2006 13:27:07 -0800

Author: jerome
Date: Mon Feb 13 13:26:15 2006
New Revision: 377493

URL: http://svn.apache.org/viewcvs?rev=377493&view=rev
Log:
Add a mini framework for microsoft documents parsing


Added:
    lucene/nutch/trunk/src/plugin/lib-parsems/
    lucene/nutch/trunk/src/plugin/lib-parsems/build.xml   (with props)
    lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml   (with props)
    lucene/nutch/trunk/src/plugin/lib-parsems/src/
    lucene/nutch/trunk/src/plugin/lib-parsems/src/java/
    lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/
    lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/
    
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/
    
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
   (with props)
    
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java
   (with props)
Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java
    lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java?rev=377493&r1=377492&r2=377493&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java Mon Feb 13 13:26:15 2006
@@ -149,6 +149,14 @@
 
   }
 
+  /**
+   * Tells whether a string carries no characters at all.
+   *
+   * @param str the string to test (may be <code>null</code>)
+   * @return <code>true</code> if <code>str</code> is <code>null</code> or
+   *         has zero length, <code>false</code> otherwise
+   */
+  public static boolean isEmpty(String str) {
+    if (str == null) {
+      return true;
+    }
+    return str.length() == 0;
+  }
+  
+  
   private static HashMap encodingAliases = new HashMap();
 
   /** 

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=377493&r1=377492&r2=377493&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Mon Feb 13 13:26:15 2006
@@ -14,6 +14,7 @@
      <ant dir="lib-http" target="deploy"/>
      <ant dir="lib-jakarta-poi" target="deploy"/>
      <ant dir="lib-lucene-analyzers" target="deploy"/>
+     <ant dir="lib-parsems" target="deploy"/>
      <ant dir="nutch-extensionpoints" target="deploy"/>
      <ant dir="ontology" target="deploy"/>
      <ant dir="protocol-file" target="deploy"/>
@@ -78,6 +79,7 @@
     <ant dir="lib-http" target="clean"/>
     <ant dir="lib-jakarta-poi" target="clean"/>
     <ant dir="lib-lucene-analyzers" target="clean"/>
+    <ant dir="lib-parsems" target="clean"/>
     <ant dir="nutch-extensionpoints" target="clean"/>
     <ant dir="ontology" target="clean"/>
     <ant dir="protocol-file" target="clean"/>

Added: lucene/nutch/trunk/src/plugin/lib-parsems/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/build.xml?rev=377493&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/build.xml Mon Feb 13 13:26:15 2006
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+
+<project name="lib-parsems" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+  <path id="plugin.deps">
+    <fileset dir="../lib-jakarta-poi/lib">
+      <include name="*.jar" />
+    </fileset>
+  </path>
+
+</project>

Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml?rev=377493&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml Mon Feb 13 13:26:15 2006
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ ! A common framework for microsoft documents parsers implementations
+ !-->
+<plugin
+   id="lib-parsems"
+   name="Parse MS Documents Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-parsems.jar">
+        <export name="*"/>
+     </library>
+   </runtime>
+
+   <requires>
+      <import plugin="lib-jakarta-poi"/>
+   </requires>
+
+</plugin>

Propchange: lucene/nutch/trunk/src/plugin/lib-parsems/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java?rev=377493&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java Mon Feb 13 13:26:15 2006
@@ -0,0 +1,161 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.ms;
+
+// JDK imports
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+
+/**
+ * A generic Microsoft document parser.
+ *
+ * <p>Concrete parsers supply a format-specific {@link MSExtractor} and
+ * delegate to {@link #getParse(MSExtractor, Content)}, which turns the
+ * extracted text and document properties into a {@link Parse}.</p>
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class MSBaseParser implements Parser {
+
+  private Configuration conf;
+
+  protected static final Logger LOG =
+          LogFormatter.getLogger(MSBaseParser.class.getName());
+
+
+  /**
+   * Parses a Content with a specific {@link MSExtractor Microsoft document
+   * extractor}.
+   *
+   * @param extractor the extractor used to pull text and properties out of
+   *                  the raw document bytes
+   * @param content   the fetched content to parse
+   * @return a successful parse holding the text, title, outlinks and
+   *         metadata; or an empty parse with a <code>FAILED</code> status
+   *         when the content is truncated or the extraction throws
+   */
+  protected Parse getParse(MSExtractor extractor, Content content) {
+
+    String text = null;
+    String title = null;
+    Outlink[] outlinks = null;
+    Properties properties = null;
+
+    try {
+      byte[] raw = content.getContent();
+      String contentLength =
+              content.getMetadata().get(Metadata.CONTENT_LENGTH);
+      // Refuse documents whose byte count disagrees with the declared length
+      if ((contentLength != null) &&
+          (raw.length != Integer.parseInt(contentLength))) {
+        return new ParseStatus(ParseStatus.FAILED,
+                               ParseStatus.FAILED_TRUNCATED,
+                               "Content truncated at " + raw.length +" bytes. " +
+                               "Parser can't handle incomplete file.")
+                               .getEmptyParse(this.conf);
+      }
+      extractor.extract(new ByteArrayInputStream(raw));
+      text = extractor.getText();
+      properties = extractor.getProperties();
+      outlinks = OutlinkExtractor.getOutlinks(text, content.getUrl(),
+                                              getConf());
+
+    } catch (Exception e) {
+      return new ParseStatus(ParseStatus.FAILED,
+                             "Can't be handled as microsoft document. " + e)
+                             .getEmptyParse(this.conf);
+    }
+
+    // collect meta data
+    Metadata metadata = new Metadata();
+    // The extractor may hand back no properties at all (e.g. when no summary
+    // information was read); guard so that case yields an empty title and
+    // empty metadata instead of a NullPointerException.
+    if (properties != null) {
+      title = properties.getProperty(DublinCore.TITLE);
+      properties.remove(DublinCore.TITLE);
+      metadata.setAll(properties);
+    }
+
+    if (text == null) { text = ""; }
+    if (title == null) { title = ""; }
+
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+                                        outlinks, content.getMetadata(),
+                                        metadata);
+    parseData.setConf(this.conf);
+    return new ParseImpl(text, parseData);
+  }
+
+
+  /**
+   * Main for testing. Pass a ms document as argument.
+   *
+   * @param mime   the content type to declare for the file
+   * @param parser the parser to run on the file
+   * @param args   command line arguments; <code>args[0]</code> is the path
+   *               of the document to parse
+   */
+  public static void main(String mime, MSBaseParser parser, String args[]) {
+    if (args.length < 1) {
+      System.err.println("Usage:");
+      System.err.println("\t" + parser.getClass().getName() + " <file>");
+      System.exit(1);
+    }
+
+    String file = args[0];
+    byte[] raw = getRawBytes(new File(file));
+    if (raw == null) {
+      // getRawBytes signals a missing or unreadable file with null
+      System.err.println("Can't read file: " + file);
+      System.exit(1);
+      return;
+    }
+
+    Metadata meta = new Metadata();
+    meta.set(Response.CONTENT_LENGTH, "" + raw.length);
+    Content content = new Content(file, file, raw, mime, meta,
+                                  NutchConfiguration.create());
+
+    System.out.println(parser.getParse(content).getText());
+  }
+
+  /**
+   * Reads a whole file into memory.
+   *
+   * @param f the file to read
+   * @return the file's bytes, or <code>null</code> if the file does not
+   *         exist, ends before its reported length, or cannot be read
+   */
+  private final static byte[] getRawBytes(File f) {
+    FileInputStream fin = null;
+    try {
+      if (!f.exists())
+        return null;
+      fin = new FileInputStream(f);
+      byte[] buffer = new byte[(int) f.length()];
+      // A single read() may return fewer bytes than requested, so keep
+      // reading until the buffer is full.
+      int offset = 0;
+      while (offset < buffer.length) {
+        int count = fin.read(buffer, offset, buffer.length - offset);
+        if (count < 0) {
+          System.err.println("Unexpected end of file: " + f);
+          return null;
+        }
+        offset += count;
+      }
+      return buffer;
+    } catch (Exception err) {
+      err.printStackTrace();
+      return null;
+    } finally {
+      // Always release the file handle, including on the error paths
+      if (fin != null) {
+        try {
+          fin.close();
+        } catch (Exception ignored) {
+          // nothing useful can be done if closing a read-only stream fails
+        }
+      }
+    }
+
+  }
+
+
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java?rev=377493&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java Mon Feb 13 13:26:15 2006
@@ -0,0 +1,199 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.ms;
+
+// JDK imports
+import java.io.InputStream;
+import java.util.Date;
+import java.util.Properties;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.util.LogFormatter;
+
+// Nutch imports
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.HttpHeaders;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Office;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.util.StringUtil;
+
+// Jakarta POI imports
+import org.apache.poi.hpsf.PropertySetFactory;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+
+
+/**
+ * Defines a Microsoft document content extractor.
+ *
+ * <p>{@link #extract(InputStream)} first reads the document's summary
+ * information through a POIFS reader listener, then hands the same stream
+ * to the subclass-provided {@link #extractText(InputStream)}.</p>
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class MSExtractor {
+
+  protected final static Logger LOG =
+          LogFormatter.getLogger(MSExtractor.class.getName());
+
+  private String text = null;
+  private POIFSReader reader = null;
+  private PropertiesBroker properties = null;
+
+
+  /** Constructs a new Microsoft document extractor. */
+  protected MSExtractor() { }
+
+
+  /**
+   * Extracts properties and text from an MS Document input stream.
+   *
+   * <p>The stream is reset before each of the two passes, so it must
+   * support <code>reset()</code> (for instance a
+   * <code>ByteArrayInputStream</code>).</p>
+   *
+   * @param input the document stream
+   * @throws Exception if the stream cannot be reset or read, or if the
+   *         text extraction fails
+   */
+  protected void extract(InputStream input) throws Exception {
+    // First, extract properties
+    this.reader = new POIFSReader();
+    this.properties = new PropertiesBroker();
+    this.reader.registerListener(
+            new PropertiesReaderListener(this.properties),
+            SummaryInformation.DEFAULT_STREAM_NAME);
+    input.reset();
+    if (input.available() > 0) {
+      reader.read(input);
+    }
+    // Then, extract text
+    input.reset();
+    this.text = extractText(input);
+  }
+
+  /**
+   * Extracts the text content from a Microsoft document input stream.
+   *
+   * @param input the document stream, positioned by
+   *              {@link #extract(InputStream)} after a reset
+   * @return the text content of the document
+   * @throws Exception if the text cannot be extracted
+   */
+  protected abstract String extractText(InputStream input) throws Exception;
+
+
+  /**
+   * Get the content text of the Microsoft document.
+   * @return the content text of the document
+   */
+  protected String getText() {
+    return this.text;
+  }
+
+
+  /**
+   * Get the <code>Properties</code> of the Microsoft document.
+   * @return the properties of the document; never <code>null</code> -- an
+   *         empty set is returned when {@link #extract(InputStream)} has not
+   *         been called or no properties became available
+   */
+  protected Properties getProperties() {
+    Properties found = null;
+    if (this.properties != null) {
+      found = this.properties.getProperties();
+    }
+    return (found == null) ? new Properties() : found;
+  }
+
+
+  /**
+   * Hands the properties from the reader listener over to the extractor,
+   * letting the consumer wait a bounded time for them to be published.
+   */
+  private final static class PropertiesBroker {
+
+    /** Maximum time, in milliseconds, to wait for the properties. */
+    private final static int TIMEOUT = 2 * 1000;
+    private Properties properties = null;
+
+    /**
+     * Returns the published properties, waiting up to {@link #TIMEOUT}
+     * milliseconds for them.
+     *
+     * @return the properties, or <code>null</code> if none were published
+     *         before the timeout elapsed or the thread was interrupted
+     */
+    public synchronized Properties getProperties() {
+
+      // NOTE(review): if POIFSReader.read() delivers its events on the
+      // calling thread (TODO confirm), the properties are either already
+      // set here or never will be, and this wait is a fixed delay for
+      // documents without summary information.
+      final long start = System.currentTimeMillis();
+      long now = start;
+
+      while (this.properties == null && now - start < TIMEOUT) {
+        try {
+          wait(TIMEOUT / 10);
+        } catch (InterruptedException e) {
+          // Restore the interrupt flag for the caller and stop waiting;
+          // looping on would make wait() throw again immediately.
+          Thread.currentThread().interrupt();
+          break;
+        }
+        now = System.currentTimeMillis();
+      }
+      notifyAll();
+      return this.properties;
+    }
+
+    /**
+     * Publishes the properties and wakes up any waiting consumer.
+     *
+     * @param properties the properties read from the document
+     */
+    public synchronized void setProperties(Properties properties) {
+      this.properties = properties;
+      notifyAll();
+    }
+  }
+
+
+  /**
+   * Reader listener that copies the fields of the summary information
+   * stream into a <code>Properties</code> object and publishes it through
+   * the broker.
+   */
+  private static class PropertiesReaderListener
+          implements POIFSReaderListener {
+
+    private final PropertiesBroker propertiesBroker;
+    private final Properties metadata = new Properties();
+
+    PropertiesReaderListener(PropertiesBroker propertiesBroker) {
+      this.propertiesBroker = propertiesBroker;
+    }
+
+    public void processPOIFSReaderEvent(POIFSReaderEvent event) {
+      if (!event.getName().startsWith(
+              SummaryInformation.DEFAULT_STREAM_NAME)) {
+        return;
+      }
+
+      try {
+        SummaryInformation si = (SummaryInformation)
+                                  PropertySetFactory.create(event.getStream());
+        setProperty(DublinCore.TITLE, si.getTitle());
+        setProperty(Office.APPLICATION_NAME, si.getApplicationName());
+        setProperty(Office.AUTHOR, si.getAuthor());
+        setProperty(Office.CHARACTER_COUNT, si.getCharCount());
+        setProperty(Office.COMMENTS, si.getComments());
+        setProperty(DublinCore.DATE, si.getCreateDateTime());
+//        setProperty(Office.EDIT_TIME, si.getEditTime());
+        setProperty(HttpHeaders.LAST_MODIFIED, si.getLastSaveDateTime());
+        setProperty(Office.KEYWORDS, si.getKeywords());
+        setProperty(Office.LAST_AUTHOR, si.getLastAuthor());
+        setProperty(Office.LAST_PRINTED, si.getLastPrinted());
+        setProperty(Office.LAST_SAVED, si.getLastSaveDateTime());
+        setProperty(Office.PAGE_COUNT, si.getPageCount());
+        setProperty(Office.REVISION_NUMBER, si.getRevNumber());
+        setProperty(DublinCore.RIGHTS, si.getSecurity());
+        setProperty(DublinCore.SUBJECT, si.getSubject());
+        setProperty(Office.TEMPLATE, si.getTemplate());
+        setProperty(Office.WORD_COUNT, si.getWordCount());
+      } catch (Exception ex) {
+        // Best effort: keep whatever was collected so far, but leave a
+        // trace instead of hiding the failure.
+        LOG.warning("Can't read summary information properties: " + ex);
+      }
+      // Publish even after a failure so the consumer is not left waiting
+      propertiesBroker.setProperties(metadata);
+    }
+
+    /** Stores a textual property unless its name or value is empty. */
+    private final void setProperty(String name, String value) {
+      if (!StringUtil.isEmpty(name) && !StringUtil.isEmpty(value)) {
+        metadata.setProperty(name, value);
+      }
+    }
+
+    /** Stores an int property; a value of zero is skipped. */
+    private final void setProperty(String name, int value) {
+      if (value != 0) {
+        setProperty(name, String.valueOf(value));
+      }
+    }
+
+    /** Stores a long property; a value of zero is skipped. */
+    private final void setProperty(String name, long value) {
+      if (value != 0) {
+        setProperty(name, String.valueOf(value));
+      }
+    }
+
+    /** Stores a date property in HTTP date format; null is skipped. */
+    private final void setProperty(String name, Date date) {
+      if (date != null) {
+        setProperty(name, HttpDateFormat.toString(date));
+      }
+    }
+
+  }
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native




-------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc. Do you grep through log files
for problems?  Stop!  Download the new AJAX search engine that makes
searching your log files as easy as surfing the  web.  DOWNLOAD SPLUNK!
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=103432&bid=230486&dat=121642
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

[Nutch-cvs] svn commit: r377493 - in /lucene/nutch/trunk/src: java/org/apache/nutch/util/ plugin/ plugin/lib-parsems/ plugin/lib-parsems/src/ plugin/lib-parsems/src/java/ plugin/lib-parsems/src/java/org/ plugin/lib-parsems/src/java/org/apache/ plugin/lib-parsems/s...

Reply via email to