Repository: nutch Updated Branches: refs/heads/master ecf2bb011 -> 5943d11ad
NUTCH-1308 Add main() to ZipParser Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/5943d11a Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/5943d11a Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/5943d11a Branch: refs/heads/master Commit: 5943d11ad976f51ab0e861f8ed128ace950c246d Parents: ecf2bb0 Author: Sebastian Nagel <[email protected]> Authored: Sat Jul 2 12:44:39 2016 +0200 Committer: Sebastian Nagel <[email protected]> Committed: Sat Jul 2 12:44:39 2016 +0200 ---------------------------------------------------------------------- .../org/apache/nutch/parse/zip/ZipParser.java | 31 ++++++++++++++++++++ 1 file changed, 31 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/5943d11a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java index 5d0c2f7..f441fd0 100644 --- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java +++ b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java @@ -18,20 +18,26 @@ package org.apache.nutch.parse.zip; import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseImpl; import org.apache.nutch.parse.ParseResult; import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.Parser; import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; import org.apache.hadoop.conf.Configuration; /** @@ -110,4 +116,29 @@ public class ZipParser implements Parser { return this.conf; } + public static void main(String[] args) throws IOException { + if (args.length < 1) { + System.out.println("ZipParser <zip_file>"); + System.exit(1); + } + File file = new File(args[0]); + String url = "file:"+file.getCanonicalPath(); + FileInputStream in = new FileInputStream(file); + byte[] bytes = new byte[in.available()]; + in.read(bytes); + in.close(); + Configuration conf = NutchConfiguration.create(); + ZipParser parser = new ZipParser(); + parser.setConf(conf); + Metadata meta = new Metadata(); + meta.add(Response.CONTENT_LENGTH, ""+file.length()); + ParseResult parseResult = parser.getParse(new Content(url, url, bytes, + "application/zip", meta, conf)); + Parse p = parseResult.get(url); + System.out.println(parseResult.size()); + System.out.println("Parse Text:"); + System.out.println(p.getText()); + System.out.println("Parse Data:"); + System.out.println(p.getData()); + } }
