Author: jerome
Date: Sun Sep  4 13:53:49 2005
New Revision: 278626

URL: http://svn.apache.org/viewcvs?rev=278626&view=rev
Log:
NUTCH-53, Parser plugin for Zip files (Rohit Kulkarni)

Added:
    lucene/nutch/trunk/src/plugin/parse-zip/
    lucene/nutch/trunk/src/plugin/parse-zip/build.xml   (with props)
    lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml   (with props)
    lucene/nutch/trunk/src/plugin/parse-zip/sample/
    lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip   (with props)
    lucene/nutch/trunk/src/plugin/parse-zip/src/
    lucene/nutch/trunk/src/plugin/parse-zip/src/java/
    lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/
    lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/
    
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
   (with props)
    
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
   (with props)
    lucene/nutch/trunk/src/plugin/parse-zip/src/test/
    lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/
    lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/
    
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
   (with props)
Modified:
    lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=278626&r1=278625&r2=278626&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Sun Sep  4 13:53:49 2005
@@ -21,6 +21,7 @@
 <!-- <ant dir="parse-mp3" target="deploy"/> -->
 <!-- <ant dir="parse-rtf" target="deploy"/> -->
      <ant dir="parse-ext" target="deploy"/>
+     <ant dir="parse-zip" target="deploy"/>
      <ant dir="index-basic" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
      <ant dir="query-basic" target="deploy"/>
@@ -48,6 +49,7 @@
  <!-- <ant dir="parse-mp3" target="test"/> -->
  <!-- <ant dir="parse-rtf" target="test"/> -->
      <ant dir="parse-ext" target="test"/>
+     <ant dir="parse-zip" target="test"/>
      <ant dir="creativecommons" target="test"/>
      <ant dir="languageidentifier" target="test"/>
      <ant dir="ontology" target="test"/>
@@ -72,6 +74,7 @@
     <ant dir="parse-mp3" target="clean"/>
     <ant dir="parse-rtf" target="clean"/>
     <ant dir="parse-ext" target="clean"/>
+    <ant dir="parse-zip" target="clean"/>
     <ant dir="index-basic" target="clean"/>
     <ant dir="index-more" target="clean"/>
     <ant dir="query-basic" target="clean"/>

Added: lucene/nutch/trunk/src/plugin/parse-zip/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/build.xml?rev=278626&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-zip/build.xml Sun Sep  4 13:53:49 2005
@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+
+<!-- Ant build file for the parse-zip plugin; the default target is "jar". -->
+<project name="parse-zip" default="jar">
+
+  <!-- Pull in the build file shared by the plugins (one directory up). -->
+  <import file="../build-plugin.xml"/>
+  
+       <!-- For the JUnit test: create ${build.test}/data and copy every
+            *.zip found in sample/ into it. These tasks sit directly under
+            <project>, not inside a <target>. -->
+       <mkdir dir="${build.test}/data" />
+       <copy todir="${build.test}/data">
+               <fileset dir="sample">
+                       <include name="*.zip" />
+               </fileset>
+       </copy>
+
+</project>

Propchange: lucene/nutch/trunk/src/plugin/parse-zip/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml?rev=278626&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml Sun Sep  4 13:53:49 2005
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Plugin descriptor for parse-zip: declares the plugin jar and registers
+     ZipParser as a Parser extension for Zip content. -->
+<plugin
+   id="parse-zip"
+   name="Zip Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <!-- The plugin code lives in parse-zip.jar; everything in it is exported. -->
+   <runtime>
+      <library name="parse-zip.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <!-- Contributes to the org.apache.nutch.parse.Parser extension point. -->
+   <extension id="org.apache.nutch.parse.zip"
+              name="ZipParser" 
+              point="org.apache.nutch.parse.Parser">
+
+      <!-- ZipParser is declared for content type application/zip and for
+           the path suffix "zip". -->
+      <implementation id="org.apache.nutch.parse.zip.ZipParser" 
+                      class="org.apache.nutch.parse.zip.ZipParser" 
+                      contentType="application/zip"
+                      pathSuffix="zip"/>
+   </extension>
+
+</plugin>

Propchange: lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip?rev=278626&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: 
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=278626&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
 Sun Sep  4 13:53:49 2005
@@ -0,0 +1,101 @@
+/*
+ * ZipParser.java
+ *
+ * Nutch parse plugin for zip files - Content Type : application/zip
+ */
+
+package org.apache.nutch.parse.zip;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.util.Properties;
+import java.util.logging.Logger;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogFormatter;
+
+/**
+ * Nutch parse plugin for Zip archives (Content-Type
+ * <code>application/zip</code>). The text and the outlinks of an archive
+ * are gathered from its entries by {@link ZipTextExtractor}.
+ *
+ * ZipParser class based on MSPowerPointParser class by Stephan Strittmatter
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class ZipParser implements Parser {
+
+    private static final Logger LOG =
+        LogFormatter.getLogger(ZipParser.class.getName());
+
+    /** Creates a new instance of ZipParser */
+    public ZipParser() {
+    }
+
+    /**
+     * Parses the given content as a Zip archive.
+     *
+     * @param content the content to parse; its raw bytes are handed to
+     *                {@link ZipTextExtractor}
+     * @return a parse with status STATUS_SUCCESS carrying the extracted
+     *         text, an empty title, the collected outlinks and a copy of
+     *         the content metadata. A FAILED status's empty parse is
+     *         returned instead when the content type is set but does not
+     *         start with application/zip, when a Content-Length header
+     *         is present and differs from the number of bytes held, or
+     *         when extraction throws.
+     */
+    public Parse getParse(final Content content) {
+
+        // check that contentType is one we can handle
+        final String contentType = content.getContentType();
+        if (contentType != null
+            && !contentType.startsWith("application/zip")) {
+            return new ParseStatus(ParseStatus.FAILED,
+                ParseStatus.FAILED_INVALID_FORMAT,
+                "Content-Type not application/zip: "
+                + contentType).getEmptyParse();
+        }
+
+        String resultText = null;
+        final List outLinksList = new ArrayList();
+
+        try {
+            final byte[] contentInBytes = content.getContent();
+
+            // Content-Length is optional: only parse and compare it when
+            // the header is present. Parsing it unconditionally made
+            // every archive without the header fail with a
+            // NumberFormatException.
+            final String contentLen = content.get("Content-Length");
+            if (contentLen != null) {
+                final int len = Integer.parseInt(contentLen);
+                LOG.fine("ziplen: " + len);
+                if (contentInBytes.length != len) {
+                    return new ParseStatus(ParseStatus.FAILED,
+                        ParseStatus.FAILED_TRUNCATED,
+                        "Content truncated at " + contentInBytes.length
+                        + " bytes. Parser can't handle incomplete "
+                        + "zip file.").getEmptyParse();
+                }
+            }
+
+            // extract text
+            final ZipTextExtractor extractor = new ZipTextExtractor();
+            resultText = extractor.extractText(
+                new ByteArrayInputStream(contentInBytes),
+                content.getUrl(), outLinksList);
+
+        } catch (Exception e) {
+            // Includes a malformed Content-Length value and any failure
+            // raised while reading the archive.
+            return new ParseStatus(ParseStatus.FAILED,
+                "Can't be handled as Zip document. " + e).getEmptyParse();
+        }
+
+        // collect meta data
+        final Properties metadata = new Properties();
+        metadata.putAll(content.getMetadata()); // copy through
+
+        if (resultText == null) {
+            resultText = "";
+        }
+
+        // No title is extracted from a Zip archive.
+        final String resultTitle = "";
+
+        final Outlink[] outlinks =
+            (Outlink[]) outLinksList.toArray(new Outlink[0]);
+        final ParseData parseData =
+            new ParseData(ParseStatus.STATUS_SUCCESS,
+                          resultTitle,
+                          outlinks,
+                          metadata);
+
+        LOG.finest("Zip file parsed sucessfully !!");
+        return new ParseImpl(resultText, parseData);
+    }
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=278626&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
 Sun Sep  4 13:53:49 2005
@@ -0,0 +1,119 @@
+/*
+ * ZipTextExtractor.java
+ *
+ *
+ */
+
+package org.apache.nutch.parse.zip;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+import java.util.logging.Logger;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogFormatter;
+
+/**
+ * Extracts text and outlinks from the entries of a Zip archive by
+ * handing each entry to the parser that {@link ParserFactory} returns
+ * for the content type guessed from the entry's file extension.
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class ZipTextExtractor {
+    public static final Logger LOG =
+        LogFormatter.getLogger(ZipTextExtractor.class.getName());
+
+    /** Creates a new instance of ZipTextExtractor */
+    public ZipTextExtractor() {
+    }
+
+    /**
+     * Parses every non-directory entry whose name has an extension and
+     * concatenates the results.
+     *
+     * @param input        stream positioned at the start of the Zip data;
+     *                     it is read but not closed here
+     * @param url          URL of the archive; each entry is addressed as
+     *                     <code>url + "/" + entryName</code>
+     * @param outLinksList receives a copy of every Outlink found in the
+     *                     parsed entries
+     * @return for each parsed entry, its name, a space, its text and a
+     *         space, appended in archive order; entries whose parse
+     *         raises a ParseException are logged and left out
+     * @throws IOException if the archive cannot be read or an entry URL
+     *                     is malformed
+     */
+    public String extractText(InputStream input, String url,
+                              List outLinksList) throws IOException {
+        // StringBuffer: avoids re-copying the accumulated text per entry
+        StringBuffer resultText = new StringBuffer();
+
+        ZipInputStream zin = new ZipInputStream(input);
+        ZipEntry entry;
+
+        while ((entry = zin.getNextEntry()) != null) {
+
+            if (entry.isDirectory()) {
+                continue;
+            }
+
+            String fname = entry.getName();
+            String newurl = url + "/" + fname;
+            // Constructing the URL also validates it.
+            String base = new URL(newurl).toString();
+
+            int i = fname.lastIndexOf('.');
+            if (i == -1) {
+                // entries without a file extension are not parsed
+                continue;
+            }
+            String contentType = guessContentType(fname.substring(i + 1));
+
+            // Read until the end of the entry instead of trusting
+            // entry.getSize(): the size is -1 when it is not known,
+            // which used to cause a NegativeArraySizeException.
+            byte[] b = readEntry(zin);
+
+            LOG.fine("trying to parse " + fname);
+            try {
+                Properties metadata = new Properties();
+                // Use the number of bytes actually read so the header
+                // always matches the content handed to the parser.
+                metadata.setProperty("Content-Length",
+                                     Integer.toString(b.length));
+                metadata.setProperty("Content-Type", contentType);
+                Content content =
+                    new Content(newurl, base, b, contentType, metadata);
+                Parser parser =
+                    ParserFactory.getParser(contentType, newurl);
+                Parse parse = parser.getParse(content);
+                ParseData theParseData = parse.getData();
+                Outlink[] theOutlinks = theParseData.getOutlinks();
+
+                for (int count = 0; count < theOutlinks.length; count++) {
+                    outLinksList.add(
+                        new Outlink(theOutlinks[count].getToUrl(),
+                                    theOutlinks[count].getAnchor()));
+                }
+
+                resultText.append(fname).append(" ")
+                          .append(parse.getText()).append(" ");
+            } catch (ParseException e) {
+                LOG.info("fetch okay, but can't parse " + fname
+                         + ", reason: " + e.getMessage());
+            }
+        }
+
+        return resultText.toString();
+    }
+
+    /**
+     * Maps a file extension (without the dot, any case) to a content
+     * type; returns the empty string for extensions not listed here.
+     */
+    private static String guessContentType(String ext) {
+        if (ext.equalsIgnoreCase("txt") || ext.equalsIgnoreCase("c")
+            || ext.equalsIgnoreCase("cc") || ext.equalsIgnoreCase("pl")
+            || ext.equalsIgnoreCase("sh") || ext.equalsIgnoreCase("java")
+            || ext.equalsIgnoreCase("cpp")) {
+            return "text/plain";
+        } else if (ext.equalsIgnoreCase("html")
+            || ext.equalsIgnoreCase("htm")) {
+            return "text/html";
+        } else if (ext.equalsIgnoreCase("xls")
+            || ext.equalsIgnoreCase("xla")
+            || ext.equalsIgnoreCase("xlt")
+            || ext.equalsIgnoreCase("xlw")) {
+            return "application/vnd.ms-excel";
+        } else if (ext.equalsIgnoreCase("ppt")
+            || ext.equalsIgnoreCase("pps")) {
+            return "application/vnd.ms-powerpoint";
+        } else if (ext.equalsIgnoreCase("doc")) {
+            return "application/msword";
+        } else if (ext.equalsIgnoreCase("mp3")) {
+            return "audio/mpeg";
+        } else if (ext.equalsIgnoreCase("pdf")) {
+            return "application/pdf";
+        } else if (ext.equalsIgnoreCase("rtf")) {
+            return "application/rtf";
+        } else if (ext.equalsIgnoreCase("zip")) {
+            return "application/zip";
+        }
+        return "";
+    }
+
+    /**
+     * Reads the given stream until it reports end of data and returns
+     * the bytes read. For a ZipInputStream that is the remainder of the
+     * current entry.
+     */
+    private static byte[] readEntry(InputStream in) throws IOException {
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        byte[] buf = new byte[4096];
+        int n;
+        while ((n = in.read(buf)) != -1) {
+            out.write(buf, 0, n);
+        }
+        return out.toByteArray();
+    }
+
+}
+

Propchange: 
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=278626&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
 Sun Sep  4 13:53:49 2005
@@ -0,0 +1,63 @@
+/*
+ * TestZipParser.java
+ */
+
+package org.apache.nutch.parse.zip;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit test for the Zip parse plugin.
+ * Based on Unit tests for MSWordParser by John Xing
+ *
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class TestZipParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data",".");
+
+  // Make sure sample files are copied to "test.data"
+
+  private String[] sampleFiles = {"test.zip"};
+
+  // Text expected from every sample file: entry name, a space, the
+  // entry's text and a trailing space.
+  private String expectedText = "textfile.txt This is text file number 1 ";
+
+  public TestZipParser(String name) {
+    super(name);
+  }
+
+  protected void setUp() {}
+
+  protected void tearDown() {}
+
+  /**
+   * Fetches each sample file through its protocol plugin, parses it with
+   * the parser selected for its content type and checks the extracted
+   * text.
+   */
+  public void testIt() throws ProtocolException, ParseException {
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      Protocol protocol = ProtocolFactory.getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(urlString).getContent();
+
+      Parser parser =
+        ParserFactory.getParser(content.getContentType(), urlString);
+      Parse parse = parser.getParse(content);
+
+      // assertEquals reports expected and actual text on failure and
+      // treats null text as a failure rather than an error.
+      assertEquals("text extracted from " + urlString,
+                   expectedText, parse.getText());
+    }
+  }
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to