Author: jerome Date: Sun Sep 4 13:53:49 2005 New Revision: 278626 URL: http://svn.apache.org/viewcvs?rev=278626&view=rev Log: NUTCH-53, Parser plugin for Zip files (Rohit Kulkarni)
Added: lucene/nutch/trunk/src/plugin/parse-zip/ lucene/nutch/trunk/src/plugin/parse-zip/build.xml (with props) lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml (with props) lucene/nutch/trunk/src/plugin/parse-zip/sample/ lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip (with props) lucene/nutch/trunk/src/plugin/parse-zip/src/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (with props) lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (with props) lucene/nutch/trunk/src/plugin/parse-zip/src/test/ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (with props) Modified: lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=278626&r1=278625&r2=278626&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Sun Sep 4 13:53:49 2005 @@ -21,6 +21,7 @@ <!-- <ant dir="parse-mp3" target="deploy"/> --> <!-- <ant dir="parse-rtf" target="deploy"/> --> <ant dir="parse-ext" target="deploy"/> + <ant dir="parse-zip" target="deploy"/> <ant dir="index-basic" target="deploy"/> <ant dir="index-more" target="deploy"/> <ant dir="query-basic" target="deploy"/> @@ -48,6 +49,7 @@ <!-- <ant dir="parse-mp3" target="test"/> --> <!-- <ant dir="parse-rtf" target="test"/> --> <ant dir="parse-ext" target="test"/> + <ant dir="parse-zip" target="test"/> <ant dir="creativecommons" target="test"/> <ant dir="languageidentifier" target="test"/> <ant dir="ontology" target="test"/> @@ -72,6 +74,7 @@ <ant dir="parse-mp3" target="clean"/> <ant dir="parse-rtf" target="clean"/> <ant dir="parse-ext" target="clean"/> + <ant dir="parse-zip" target="clean"/> <ant dir="index-basic" target="clean"/> <ant dir="index-more" target="clean"/> <ant dir="query-basic" target="clean"/> Added: lucene/nutch/trunk/src/plugin/parse-zip/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/build.xml?rev=278626&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/build.xml (added) +++ lucene/nutch/trunk/src/plugin/parse-zip/build.xml Sun Sep 4 13:53:49 2005 @@ -0,0 +1,15 @@ +<?xml version="1.0"?> + +<project name="parse-zip" default="jar"> + + <import file="../build-plugin.xml"/> + + <!-- for junit test --> + <mkdir dir="${build.test}/data" /> + <copy todir="${build.test}/data"> + <fileset dir="sample"> + <include name="*.zip" /> + </fileset> + </copy> + +</project> Propchange: lucene/nutch/trunk/src/plugin/parse-zip/build.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml?rev=278626&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml Sun Sep 4 13:53:49 2005 @@ -0,0 +1,24 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="parse-zip" + name="Zip Parse Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="parse-zip.jar"> + <export name="*"/> + </library> + </runtime> + + <extension id="org.apache.nutch.parse.zip" + name="ZipParser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.apache.nutch.parse.zip.ZipParser" + class="org.apache.nutch.parse.zip.ZipParser" + contentType="application/zip" + pathSuffix="zip"/> + </extension> + +</plugin> Propchange: lucene/nutch/trunk/src/plugin/parse-zip/plugin.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip?rev=278626&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-zip/sample/test.zip ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=278626&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (added) +++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Sun Sep 4 13:53:49 2005 @@ -0,0 +1,101 @@ +/* + * ZipParser.java + * + * Nutch parse plugin for zip files - Content Type : application/zip + */ + +package org.apache.nutch.parse.zip; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.util.Properties; +import java.util.logging.Logger; +import java.util.ArrayList; +import java.util.List; + +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.LogFormatter; + +/** + * + * @author Rohit Kulkarni & Ashish Vaidya + * ZipParser class based on MSPowerPointParser class by Stephan Strittmatter + */ +public class ZipParser implements Parser{ + + private static final Logger LOG = LogFormatter.getLogger(ZipParser.class.getName()); + /** Creates a new instance of ZipParser */ + public ZipParser() { + } + + public Parse getParse(final Content content) { + + // check that contentType is one we can handle + final String contentType = content.getContentType(); + if (contentType != null && !contentType.startsWith("application/zip")) { + return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT, + "Content-Type not application/zip: " + contentType).getEmptyParse(); + } + + String resultText = null; + String resultTitle = null; + Outlink[] outlinks = null; + List outLinksList = new ArrayList(); + Properties properties = null; + + try { + final String contentLen = content.get("Content-Length"); + final int len = Integer.parseInt(contentLen); + System.out.println("ziplen: " + len); + final byte[] contentInBytes = content.getContent(); + final ByteArrayInputStream bainput = new ByteArrayInputStream(contentInBytes); + final InputStream input = bainput; + + if (contentLen != null && contentInBytes.length != len) { + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_TRUNCATED, + "Content truncated at " + contentInBytes.length + + " bytes. Parser can't handle incomplete pdf file.").getEmptyParse(); + } + + ZipTextExtractor extractor = new ZipTextExtractor(); + + // extract text + resultText = extractor.extractText(new ByteArrayInputStream(contentInBytes), + content.getUrl(), outLinksList); + + } catch (Exception e) { + return new ParseStatus(ParseStatus.FAILED, + "Can't be handled as Zip document. " + e).getEmptyParse(); + } + + // collect meta data + final Properties metadata = new Properties(); + metadata.putAll(content.getMetadata()); // copy through + + if (resultText == null) { + resultText = ""; + } + + if (resultTitle == null) { + resultTitle = ""; + } + + outlinks = (Outlink[])outLinksList.toArray(new Outlink[0]); + final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, + resultTitle, + outlinks, + metadata); + + LOG.finest("Zip file parsed sucessfully !!"); + return new ParseImpl(resultText, parseData); + } + +} Propchange: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=278626&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (added) +++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Sun Sep 4 13:53:49 2005 @@ -0,0 +1,119 @@ +/* + * ZipTextExtractor.java + * + * + */ + +package org.apache.nutch.parse.zip; + +import java.util.logging.Logger; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; +import java.net.URL; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParserFactory; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.LogFormatter; + +/** + * + * @author Rohit Kulkarni & Ashish Vaidya + */ +public class ZipTextExtractor { + public static final Logger LOG = LogFormatter.getLogger(ZipTextExtractor.class.getName()); + + /** Creates a new instance of ZipTextExtractor */ + public ZipTextExtractor() { + } + + public String extractText(InputStream input, String url, List outLinksList) throws IOException { + String resultText = ""; + byte temp; + + ZipInputStream zin = new ZipInputStream(input); + + ZipEntry entry; + + while ((entry = zin.getNextEntry()) != null) { + + if (!entry.isDirectory()) { + int size = (int) entry.getSize(); + byte[] b = new byte[size]; + for(int x = 0; x < size; x++) { + int err = zin.read(); + if(err != -1) { + b[x] = (byte)err; + } + } + String newurl = url + "/"; + String fname = entry.getName(); + newurl += fname; + URL aURL = new URL(newurl); + String base = aURL.toString(); + int i = fname.lastIndexOf('.'); + if (i != -1) { + // file name has extension + String contentType = ""; + String ext = fname.substring(i + 1, fname.length()); + if (ext.equalsIgnoreCase("txt") || ext.equalsIgnoreCase("c") + || ext.equalsIgnoreCase("cc") || ext.equalsIgnoreCase("pl") + || ext.equalsIgnoreCase("sh") || ext.equalsIgnoreCase("java") + || ext.equalsIgnoreCase("cpp")) { + contentType = "text/plain"; + } else if (ext.equalsIgnoreCase("html") || ext.equalsIgnoreCase("htm")) { + contentType = "text/html"; + } else if (ext.equalsIgnoreCase("xls") || ext.equalsIgnoreCase("xla") + || ext.equalsIgnoreCase("xlt") || ext.equalsIgnoreCase("xlw")) { + contentType = "application/vnd.ms-excel"; + } else if (ext.equalsIgnoreCase("ppt") || ext.equalsIgnoreCase("pps")) { + contentType = "application/vnd.ms-powerpoint"; + } else if (ext.equalsIgnoreCase("doc")) { + contentType = "application/msword"; + } else if (ext.equalsIgnoreCase("mp3")) { + contentType = "audio/mpeg"; + } else if (ext.equalsIgnoreCase("pdf")) { + contentType = "application/pdf"; + } else if (ext.equalsIgnoreCase("rtf")) { + contentType = "application/rtf"; + } else if (ext.equalsIgnoreCase("zip")) { + contentType = "application/zip"; + } + System.out.println("trying to parse " + fname); + try { + Properties metadata = new Properties(); + metadata.setProperty("Content-Length", Long.toString(entry.getSize())); + metadata.setProperty("Content-Type", contentType); + Content content = new Content(newurl, base, b, contentType, metadata); + Parser parser = ParserFactory.getParser(contentType, newurl); + Parse parse = parser.getParse(content); + ParseData theParseData = parse.getData(); + Outlink[] theOutlinks = theParseData.getOutlinks(); + + for(int count = 0; count < theOutlinks.length; count++) { + outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor())); + } + + resultText += entry.getName() + " " + parse.getText() + " "; + } catch (ParseException e) { + + LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage()); + } + } + } + } + + return resultText; + } + +} + Propchange: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=278626&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (added) +++ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java Sun Sep 4 13:53:49 2005 @@ -0,0 +1,63 @@ +/* + * TestZipParser.java + */ + +package org.apache.nutch.parse.zip; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; + +import org.apache.nutch.parse.ParserFactory; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; + +import junit.framework.TestCase; + +/** + * Based on Unit tests for MSWordParser by John Xing + * + * @author Rohit Kulkarni & Ashish Vaidya + */ +public class TestZipParser extends TestCase { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data","."); + + // Make sure sample files are copied to "test.data" + + private String[] sampleFiles = {"test.zip"}; + + private String expectedText = "textfile.txt This is text file number 1 "; + + public TestZipParser(String name) { + super(name); + } + + protected void setUp() {} + + protected void tearDown() {} + + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parser parser; + Parse parse; + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + protocol = ProtocolFactory.getProtocol(urlString); + content = protocol.getProtocolOutput(urlString).getContent(); + + parser = ParserFactory.getParser(content.getContentType(), urlString); + parse = parser.getParse(content); + assertTrue(parse.getText().equals(expectedText)); + } + } + +} Propchange: lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java ------------------------------------------------------------------------------ svn:eol-style = native