[34/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

thammegowda Tue, 05 Jul 2016 15:50:01 -0700

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
new file mode 100644
index 0000000..b1762e6
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Test extraction of image metadata
+ */
+public class TestImageMetadata {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  private String[] sampleFiles = { "nutch_logo_tm.gif", };
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      Configuration conf = NutchConfiguration.create();
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
+
+      Assert.assertEquals("121", parse.getData().getMeta("width"));
+      Assert.assertEquals("48", parse.getData().getMeta("height"));
+    }
+  }
+
+}


http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
new file mode 100644
index 0000000..576b3df
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+
+/**
+ * Unit tests for MSWordParser.
+ * 
+ * @author John Xing
+ */
+public class TestMSWordParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-msword/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
+  private String[] sampleFiles = { "word97.doc" };
+
+  private String expectedText = "This is a sample doc file prepared for nutch.";
+
+  private Configuration conf;
+
+  @Before
+  public void setUp() {
+    conf = NutchConfiguration.create();
+    conf.set("file.content.limit", "-1");
+  }
+
+  public String getTextContent(String fileName) throws ProtocolException,
+      ParseException {
+    String urlString = "file:" + sampleDir + fileSeparator + fileName;
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+        .get(content.getUrl());
+    return parse.getText();
+  }
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String found = getTextContent(sampleFiles[i]);
+      Assert.assertTrue("text found : '" + found + "'",
+          found.startsWith(expectedText));
+    }
+  }
+
+  @Test
+  public void testOpeningDocs() throws ProtocolException, ParseException {
+    String[] filenames = new File(sampleDir).list();
+    for (int i = 0; i < filenames.length; i++) {
+      if (filenames[i].endsWith(".doc") == false)
+        continue;
+      Assert.assertTrue("cann't read content of " + filenames[i],
+          getTextContent(filenames[i]).length() > 0);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
new file mode 100644
index 0000000..6960bad
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.protocol.*;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for OOParser.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class TestOOParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-oo/build.xml during plugin compilation.
+  private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
+
+  private String expectedText;
+
+  private String sampleText = "ootest.txt";
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Content content;
+    Parse parse;
+    Configuration conf = NutchConfiguration.create();
+    Protocol protocol;
+    ProtocolFactory factory = new ProtocolFactory(conf);
+
+    System.out.println("Expected : " + expectedText);
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      if (sampleFiles[i].startsWith("ootest") == false)
+        continue;
+
+      protocol = factory.getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
+
+      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
+
+      // simply test for the presence of a text - the ordering of the elements
+      // may differ from what was expected
+      // in the previous tests
+      Assert.assertTrue(text != null && text.length() > 0);
+
+      System.out.println("Found " + sampleFiles[i] + ": " + text);
+    }
+  }
+
+  public TestOOParser() {
+    try {
+      // read the test string
+      FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
+          + sampleText);
+      StringBuffer sb = new StringBuffer();
+      int len = 0;
+      InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
+      char[] buf = new char[1024];
+      while ((len = isr.read(buf)) > 0) {
+        sb.append(buf, 0, len);
+      }
+      isr.close();
+      expectedText = sb.toString();
+      // normalize space
+      expectedText = expectedText.replaceAll("[ \t\r\n]+", " ");
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
new file mode 100644
index 0000000..9884f0c
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for PdfParser.
+ * 
+ * @author John Xing
+ */
+public class TestPdfParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-pdf/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
+  private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
+
+  private String expectedText = "A VERY SMALL PDF FILE";
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      Configuration conf = NutchConfiguration.create();
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
+
+      int index = parse.getText().indexOf(expectedText);
+      Assert.assertTrue(index > 0);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
new file mode 100644
index 0000000..f15d821
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.tika;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests).
+ * 
+ * @author Andy Hedges
+ */
+public class TestRTFParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-rtf/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.
+  private String rtfFile = "test.rtf";
+
+  @Ignore("There seems to be an issue with line 71 e.g. text.trim()")
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+
+    Configuration conf = NutchConfiguration.create();
+    urlString = "file:" + sampleDir + fileSeparator + rtfFile;
+    protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
+        .getContent();
+    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(
+        content.getUrl());
+    String text = parse.getText();
+    Assert.assertEquals("The quick brown fox jumps over the lazy dog",
+        text.trim());
+
+    String title = parse.getData().getTitle();
+    Metadata meta = parse.getData().getParseMeta();
+
+    Assert.assertEquals("test rft document", title);
+    Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
new file mode 100644
index 0000000..4224f93
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.tika.HTMLMetaProcessor;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for HTMLMetaProcessor. */
+public class TestRobotsMetaProcessor {
+
+  /*
+   * 
+   * some sample tags:
+   * 
+   * <meta name="robots" content="index,follow"> <meta name="robots"
+   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
+   * <meta name="robots" content="noindex,nofollow">
+   * 
+   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
+   */
+
+  public static String[] tests = {
+      "<html><head><title>test page</title>"
+          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"all\"> "
+          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,follow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,follow\"> "
+          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
+          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+  };
+
+  public static final boolean[][] answers = { { true, true, true }, // NONE
+      { false, false, true }, // all
+      { true, true, true }, // nOnE
+      { true, true, false }, // none
+      { true, true, false }, // noindex,nofollow
+      { true, false, false }, // noindex,follow
+      { false, true, false }, // index,nofollow
+      { false, false, false }, // index,follow
+      { false, false, false }, // missing!
+  };
+
+  private URL[][] currURLsAndAnswers;
+
+  @Test
+  public void testRobotsMetaProcessor() {
+    DOMFragmentParser parser = new DOMFragmentParser();
+    ;
+
+    try {
+      currURLsAndAnswers = new URL[][] {
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org/foo/"),
+              new URL("http://www.nutch.org/") },
+          { new URL("http://www.nutch.org"),
+              new URL("http://www.nutch.org/base/") } };
+    } catch (Exception e) {
+      Assert.assertTrue("couldn't make test URLs!", false);
+    }
+
+    for (int i = 0; i < tests.length; i++) {
+      byte[] bytes = tests[i].getBytes();
+
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+
+      try {
+        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
+      } catch (Exception e) {
+        e.printStackTrace();
+      }
+
+      HTMLMetaTags robotsMeta = new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
+
+      Assert.assertTrue("got index wrong on test " + i,
+          robotsMeta.getNoIndex() == answers[i][0]);
+      Assert.assertTrue("got follow wrong on test " + i,
+          robotsMeta.getNoFollow() == answers[i][1]);
+      Assert.assertTrue("got cache wrong on test " + i,
+          robotsMeta.getNoCache() == answers[i][2]);
+      Assert
+          .assertTrue(
+              "got base href wrong on test " + i + " (got "
+                  + robotsMeta.getBaseHref() + ")",
+              ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
+                  || ((robotsMeta.getBaseHref() != null) && robotsMeta
+                      .getBaseHref().equals(currURLsAndAnswers[i][1])));
+
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/build.xml b/nutch-plugins/parse-zip/build.xml
new file mode 100644
index 0000000..991ce31
--- /dev/null
+++ b/nutch-plugins/parse-zip/build.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-zip" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
+   <!-- <ant target="deploy" inheritall="false" dir="../parse-text"/>-->
+  </target>
+
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+    <fileset dir="sample">
+      <include name="*.zip" />
+    </fileset>
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/ivy.xml b/nutch-plugins/parse-zip/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-zip/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/plugin.xml b/nutch-plugins/parse-zip/plugin.xml
new file mode 100644
index 0000000..35ec0eb
--- /dev/null
+++ b/nutch-plugins/parse-zip/plugin.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-zip"
+   name="Zip Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parse-zip.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.parse.zip"
+              name="ZipParser" 
+              point="org.apache.nutch.parse.Parser">
+
+      <implementation id="org.apache.nutch.parse.zip.ZipParser" 
+                      class="org.apache.nutch.parse.zip.ZipParser">
+        <parameter name="contentType" value="application/zip"/>
+        <parameter name="pathSuffix"  value="zip"/>
+      </implementation>
+      
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/pom.xml b/nutch-plugins/parse-zip/pom.xml
new file mode 100644
index 0000000..b30b9a1
--- /dev/null
+++ b/nutch-plugins/parse-zip/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-zip</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-zip</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/sample/test.zip
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/sample/test.zip 
b/nutch-plugins/parse-zip/sample/test.zip
new file mode 100644
index 0000000..0c649d2
Binary files /dev/null and b/nutch-plugins/parse-zip/sample/test.zip differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java
 
b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java
new file mode 100644
index 0000000..f441fd0
--- /dev/null
+++ 
b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java
@@ -0,0 +1,144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.zip;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * ZipParser class based on MSPowerPointParser class by Stephan Strittmatter.
+ * Nutch parse plugin for zip files - Content Type : application/zip
+ */
+public class ZipParser implements Parser {
+
+  private static final Logger LOG = LoggerFactory.getLogger(ZipParser.class);
+  private Configuration conf;
+
+  /** Creates a new instance of ZipParser */
+  public ZipParser() {
+  }
+
+  /**
+   * Parses zip content: extracts the text and outlinks of the archive
+   * entries. Returns an empty FAILED parse result when the content is
+   * truncated or cannot be handled as a zip archive.
+   */
+  public ParseResult getParse(final Content content) {
+    String resultText = null;
+    final List<Outlink> outLinksList = new ArrayList<Outlink>();
+
+    try {
+      final byte[] contentInBytes = content.getContent();
+      final String contentLen = content.getMetadata().get(
+          Response.CONTENT_LENGTH);
+      // Content-Length may be absent: validate it only when present. Parsing
+      // it before the null check made every response without it fail.
+      if (contentLen != null) {
+        final int len = Integer.parseInt(contentLen);
+        LOG.debug("ziplen: {}", len);
+        if (contentInBytes.length != len) {
+          return new ParseStatus(ParseStatus.FAILED,
+              ParseStatus.FAILED_TRUNCATED, "Content truncated at "
+                  + contentInBytes.length
+                  + " bytes. Parser can't handle incomplete zip file.")
+              .getEmptyParseResult(content.getUrl(), getConf());
+        }
+      }
+
+      resultText = new ZipTextExtractor(getConf()).extractText(
+          new ByteArrayInputStream(contentInBytes), content.getUrl(),
+          outLinksList);
+    } catch (Exception e) {
+      // Parser boundary: report any failure as a failed parse.
+      return new ParseStatus(ParseStatus.FAILED,
+          "Can't be handled as Zip document. " + e).getEmptyParseResult(
+          content.getUrl(), getConf());
+    }
+
+    if (resultText == null) {
+      resultText = "";
+    }
+    // No title is extracted from a zip archive, so the title stays empty.
+    final Outlink[] outlinks = outLinksList.toArray(new Outlink[0]);
+    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
+        outlinks, content.getMetadata());
+    LOG.trace("Zip file parsed sucessfully !!");
+    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(
+        resultText, parseData));
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /** Parses the zip file given as first argument and prints the result. */
+  public static void main(String[] args) throws IOException {
+    if (args.length < 1) {
+      System.out.println("ZipParser <zip_file>");
+      System.exit(1);
+    }
+    File file = new File(args[0]);
+    String url = "file:"+file.getCanonicalPath();
+    // available() is only an estimate and one read() may return early.
+    byte[] bytes = new byte[(int) file.length()];
+    java.io.DataInputStream in = new java.io.DataInputStream(
+        new FileInputStream(file));
+    try {
+      in.readFully(bytes);
+    } finally {
+      in.close();
+    }
+    Configuration conf = NutchConfiguration.create();
+    ZipParser parser = new ZipParser();
+    parser.setConf(conf);
+    Metadata meta = new Metadata();
+    meta.add(Response.CONTENT_LENGTH, ""+file.length());
+    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
+        "application/zip", meta, conf));
+    Parse p = parseResult.get(url);
+    System.out.println(parseResult.size());
+    System.out.println("Parse Text:");
+    System.out.println(p.getText());
+    System.out.println("Parse Data:");
+    System.out.println(p.getData());
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
 
b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
new file mode 100644
index 0000000..b454727
--- /dev/null
+++ 
b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.zip;
+
+// JDK imports
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+import java.net.URL;
+
+// SLF4J logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.protocol.Content;
+import org.apache.tika.Tika;
+
+/**
+ * 
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class ZipTextExtractor {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(ZipTextExtractor.class);
+
+  private Configuration conf;
+
+  /** Creates a new instance of ZipTextExtractor */
+  public ZipTextExtractor(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Extracts the text of every file entry of a zip stream whose name has an
+   * extension, parsing each entry according to its detected content type.
+   *
+   * @param input stream holding the zip data; it is not closed here
+   * @param url URL of the archive; an entry URL is url + "/" + entry name
+   * @param outLinksList receives the outlinks of the parsed entries
+   * @return "name text " of each parsed entry, concatenated in archive order
+   * @throws IOException on unreadable zip data or a malformed entry URL
+   */
+  public String extractText(InputStream input, String url,
+      List<Outlink> outLinksList) throws IOException {
+    StringBuilder resultText = new StringBuilder();
+    ZipInputStream zin = new ZipInputStream(input);
+    Tika tika = new Tika();
+    byte[] buffer = new byte[8192];
+    ZipEntry entry;
+
+    while ((entry = zin.getNextEntry()) != null) {
+      if (entry.isDirectory()) {
+        continue;
+      }
+      String fname = entry.getName();
+      String newurl = url + "/" + fname;
+      String base = new URL(newurl).toString();
+      if (fname.lastIndexOf('.') == -1) {
+        continue;
+      }
+      // Read up to the end of the entry: getSize() is -1 when the size is
+      // unknown, which made the fixed-size buffer allocation fail.
+      java.io.ByteArrayOutputStream data = new java.io.ByteArrayOutputStream();
+      for (int n; (n = zin.read(buffer)) != -1;) {
+        data.write(buffer, 0, n);
+      }
+      byte[] b = data.toByteArray();
+      // Trying to resolve the Mime-Type
+      String contentType = tika.detect(fname);
+      try {
+        Metadata metadata = new Metadata();
+        metadata.set(Response.CONTENT_LENGTH, Integer.toString(b.length));
+        metadata.set(Response.CONTENT_TYPE, contentType);
+        Content content = new Content(newurl, base, b, contentType,
+            metadata, this.conf);
+        Parse parse = new ParseUtil(this.conf).parse(content).get(
+            content.getUrl());
+        for (Outlink link : parse.getData().getOutlinks()) {
+          outLinksList.add(new Outlink(link.getToUrl(), link.getAnchor()));
+        }
+        resultText.append(fname + " " + parse.getText() + " ");
+      } catch (ParseException e) {
+        LOG.info("fetch okay, but can't parse {}, reason: {}", fname,
+            e.getMessage());
+      }
+    }
+
+    return resultText.toString();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java
 
b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java
new file mode 100644
index 0000000..fc81ee1
--- /dev/null
+++ 
b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse ZIP files: embedded files are recursively passed to appropriate parsers.
+ */
+package org.apache.nutch.parse.zip;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
 
b/nutch-plugins/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
new file mode 100644
index 0000000..17e386a
--- /dev/null
+++ 
b/nutch-plugins/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.zip;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Based on Unit tests for MSWordParser by John Xing
+ * 
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class TestZipParser {
+
+  private final String fileSeparator = System.getProperty("file.separator");
+  // Directory holding the sample files, read from the "test.data" system
+  // property (defaults to "." when the property is not set).
+  private final String sampleDir = System.getProperty("test.data", ".");
+
+  // Make sure sample files are copied to "test.data"
+  private final String[] sampleFiles = { "test.zip" };
+
+  private final String expectedText = "textfile.txt This is text file number 1 ";
+
+  /**
+   * Fetches each sample archive through its protocol plugin, parses it with
+   * the parse-zip plugin and checks the extracted text.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    Configuration conf = NutchConfiguration.create();
+    for (String sampleFile : sampleFiles) {
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
+
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parseByExtensionId("parse-zip",
+          content).get(content.getUrl());
+      // assertEquals reports expected vs. actual text when the check fails.
+      Assert.assertEquals(expectedText, parse.getText());
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/build-ivy.xml 
b/nutch-plugins/parsefilter-naivebayes/build-ivy.xml
new file mode 100644
index 0000000..22bee5f
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-naivebayes" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/build.xml 
b/nutch-plugins/parsefilter-naivebayes/build.xml
new file mode 100644
index 0000000..6fb7a9d
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-naivebayes" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/ivy.xml 
b/nutch-plugins/parsefilter-naivebayes/ivy.xml
new file mode 100644
index 0000000..08cca2c
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/ivy.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+
+    <dependency org="org.apache.mahout" name="mahout-math" rev="0.10.1" />
+    <dependency org="org.apache.mahout" name="mahout-core" rev="0.9" >
+      <exclude org="org.apache.mrunit" name="mrunit"/>
+    </dependency>
+    <dependency org="org.apache.lucene" name="lucene-core" rev="5.5.0" />
+    <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" />
+
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/plugin.xml 
b/nutch-plugins/parsefilter-naivebayes/plugin.xml
new file mode 100644
index 0000000..ac15041
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/plugin.xml
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parsefilter-naivebayes"
+   name="Naive Bayes Parse Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parsefilter-naivebayes.jar">
+         <export name="*"/>
+      </library>
+      <library name="commons-cli-2.0-mahout.jar"/>
+      <library name="commons-lang3-3.1.jar"/>
+      <library name="commons-math3-3.2.jar"/>
+      <library name="guava-14.0.1.jar"/>
+      <library name="jackson-core-asl-1.9.12.jar"/>
+      <library name="jackson-mapper-asl-1.9.12.jar"/>
+      <library name="lucene-analyzers-common-5.5.0.jar"/>
+      <library name="lucene-core-5.5.0.jar"/>
+      <library name="mahout-core-0.9.jar"/>
+      <library name="mahout-math-0.10.1.jar"/>
+      <library name="slf4j-api-1.7.12.jar"/>
+      <library name="solr-commons-csv-3.5.0.jar"/>
+      <library name="t-digest-3.1.jar"/>
+      <library name="xmlpull-1.1.3.1.jar"/>
+      <library name="xpp3_min-1.1.4c.jar"/>
+      <library name="xstream-1.4.4.jar"/> 
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.htmlparsefilter.naivebayes"
+        name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="NaiveBayesHTMLParseFilter" 
+        class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/pom.xml 
b/nutch-plugins/parsefilter-naivebayes/pom.xml
new file mode 100644
index 0000000..0a99e47
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parsefilter-naivebayes</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parsefilter-naivebayes</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
 
b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
new file mode 100644
index 0000000..d755ff6
--- /dev/null
+++ 
b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parsefilter.naivebayes;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashMap;
+import java.io.InputStreamReader;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+public class Classify {
+
+  private static int uniquewords_size = 0;
+
+  private static int numof_ir = 0;
+  private static int numwords_ir = 0;
+  private static HashMap<String, Integer> wordfreq_ir = null;
+
+  private static int numof_r = 0;
+  private static int numwords_r = 0;
+  private static HashMap<String, Integer> wordfreq_r = null;
+  private static boolean ismodel = false;
+
+  /**
+   * Parses a "word:count,word:count,..." line into a word-frequency map.
+   * Empty fields (e.g. from an empty line) are skipped instead of failing.
+   */
+  public static HashMap<String, Integer> unflattenToHashmap(String line) {
+    HashMap<String, Integer> dict = new HashMap<String, Integer>();
+    for (String field : line.split(",")) {
+      if (field.length() == 0) {
+        continue;
+      }
+      String[] pair = field.split(":");
+      dict.put(pair[0], Integer.valueOf(pair[1]));
+    }
+    return dict;
+  }
+
+  /** Reads the model file "naivebayes-model" into the static fields. */
+  private static void loadModel() throws IOException {
+    FileSystem fs = FileSystem.get(new Configuration());
+    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(
+        fs.open(new Path("naivebayes-model"))));
+    try {
+      uniquewords_size = Integer.valueOf(bufferedReader.readLine());
+      bufferedReader.readLine();
+
+      numof_ir = Integer.valueOf(bufferedReader.readLine());
+      numwords_ir = Integer.valueOf(bufferedReader.readLine());
+      wordfreq_ir = unflattenToHashmap(bufferedReader.readLine());
+      bufferedReader.readLine();
+
+      numof_r = Integer.valueOf(bufferedReader.readLine());
+      numwords_r = Integer.valueOf(bufferedReader.readLine());
+      wordfreq_r = unflattenToHashmap(bufferedReader.readLine());
+      ismodel = true;
+    } finally {
+      // Close the reader even when reading the model fails.
+      bufferedReader.close();
+    }
+  }
+
+  /** Add-one smoothed log-likelihood of a word seen count times. */
+  private static double logLikelihood(Integer count, int numwords) {
+    int freq = (count == null) ? 0 : count;
+    return Math.log(freq + 1) - Math.log(numwords + uniquewords_size);
+  }
+
+  /**
+   * Classifies a text with the naive Bayes model (loaded on the first call).
+   *
+   * @param line the text to classify
+   * @return "0" when the "ir" class scores higher than the "r" class,
+   *         otherwise "1"
+   * @throws IOException if the model file cannot be read
+   */
+  public static String classify(String line) throws IOException {
+    // Locale-independent lower-casing keeps the tokens ASCII.
+    String[] linearray = line.replaceAll("[^a-zA-Z ]", "")
+        .toLowerCase(java.util.Locale.ENGLISH).split(" ");
+    if (!ismodel) {
+      loadModel();
+    }
+
+    double prob_ir = 0;
+    double prob_r = 0;
+    // The smoothing "+ 1" belongs inside the logarithm: log(count + 1).
+    // Outside it, a word seen once scored the same as an unseen word.
+    for (String word : linearray) {
+      prob_ir += logLikelihood(wordfreq_ir.get(word), numwords_ir);
+      prob_r += logLikelihood(wordfreq_r.get(word), numwords_r);
+    }
+
+    // Add the log prior of each class.
+    prob_ir += Math.log(numof_ir) - Math.log(numof_ir + numof_r);
+    prob_r += Math.log(numof_r) - Math.log(numof_ir + numof_r);
+
+    return (prob_ir > prob_r) ? "0" : "1";
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
 
b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
new file mode 100644
index 0000000..30810ae
--- /dev/null
+++ 
b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
@@ -0,0 +1,197 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parsefilter.naivebayes;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+
+import java.io.Reader;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.ArrayList;
+
+/**
+ * Html Parse filter that classifies the outlinks from the parseresult as
+ * relevant or irrelevant based on the parseText's relevancy (using a training
+ * file where you can give positive and negative example texts see the
+ * description of parsefilter.naivebayes.trainfile) and if found irrelevant it
+ * gives the link a second chance if it contains any of the words from the list
+ * given in parsefilter.naivebayes.wordlist. CAUTION: Set the parser.timeout to
+ * -1 or a bigger value than 30, when using this classifier.
+ */
+public class NaiveBayesParseFilter implements HtmlParseFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(NaiveBayesParseFilter.class);
+
+  /** Configuration property holding the name of the training file. */
+  public static final String TRAINFILE_MODELFILTER =
+      "parsefilter.naivebayes.trainfile";
+  /** Configuration property holding the name of the word-list file. */
+  public static final String DICTFILE_MODELFILTER =
+      "parsefilter.naivebayes.wordlist";
+
+  private Configuration conf;
+  private String inputFilePath;
+  private String dictionaryFile;
+  private ArrayList<String> wordlist = new ArrayList<String>();
+
+  /**
+   * Classifies the given page text.
+   *
+   * @param text
+   *          text to classify
+   * @return true if the text is classified as relevant; false if it is
+   *         classified as irrelevant or if classification failed with an
+   *         {@link IOException} (the failure is logged)
+   */
+  public boolean filterParse(String text) {
+
+    try {
+      return classify(text);
+    } catch (IOException e) {
+      LOG.error("Error occured while classifying:: " + text + " ::"
+          + StringUtils.stringifyException(e));
+    }
+
+    return false;
+  }
+
+  /**
+   * Second-tier check applied to the outlinks of a page.
+   *
+   * @param url
+   *          outlink URL to test
+   * @return true if the URL contains any word of the configured word list
+   */
+  public boolean filterUrl(String url) {
+
+    return containsWord(url, wordlist);
+
+  }
+
+  /**
+   * Runs the classifier on the given text.
+   *
+   * @param text
+   *          text to classify
+   * @return true if the classifier labels the text "1" (relevant)
+   * @throws IOException
+   *           if the classifier fails
+   */
+  public boolean classify(String text) throws IOException {
+
+    // constant-first comparison: a null label counts as "not relevant"
+    return "1".equals(Classify.classify(text));
+  }
+
+  /**
+   * Trains the model unless a file named "naivebayes-model" already exists
+   * on the configured file system.
+   *
+   * @throws Exception
+   *           if the file system cannot be accessed or training fails
+   */
+  public void train() throws Exception {
+    // check if the model file exists, if it does then don't train
+    if (!FileSystem.get(conf).exists(new Path("naivebayes-model"))) {
+      LOG.info("Training the Naive Bayes Model");
+      Train.start(inputFilePath);
+    } else {
+      LOG.info("Model file already exists. Skipping training.");
+    }
+  }
+
+  /**
+   * Tests whether a URL contains at least one word of a list.
+   *
+   * @param url
+   *          URL to inspect; null never matches
+   * @param wordlist
+   *          words to look for
+   * @return true if any word occurs as a substring of the URL
+   */
+  public boolean containsWord(String url, ArrayList<String> wordlist) {
+    if (url == null) {
+      return false;
+    }
+    for (String word : wordlist) {
+      if (url.contains(word)) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * Reads the training-file and word-list settings, loads the word list and
+   * triggers training.
+   *
+   * @param conf
+   *          configuration providing {@link #TRAINFILE_MODELFILTER} and
+   *          {@link #DICTFILE_MODELFILTER}
+   * @throws IllegalArgumentException
+   *           if either property is unset or blank, if the existence check
+   *           below fires, or if the word-list resource cannot be located
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    inputFilePath = conf.get(TRAINFILE_MODELFILTER);
+    dictionaryFile = conf.get(DICTFILE_MODELFILTER);
+    if (inputFilePath == null || inputFilePath.trim().length() == 0
+        || dictionaryFile == null || dictionaryFile.trim().length() == 0) {
+      // name the real property keys so the message points at the right settings
+      String message = "ParseFilter: NaiveBayes: trainfile or wordlist not set in the "
+          + TRAINFILE_MODELFILTER + " or " + DICTFILE_MODELFILTER;
+      LOG.error(message);
+      throw new IllegalArgumentException(message);
+    }
+    try {
+      // NOTE(review): this throws when either path EXISTS on the configured
+      // file system although the message says "not found". Both files are
+      // actually read as configuration resources, so the condition is kept
+      // as it was; confirm the intended check before inverting it.
+      if ((FileSystem.get(conf).exists(new Path(inputFilePath)))
+          || (FileSystem.get(conf).exists(new Path(dictionaryFile)))) {
+        String message = "ParseFilter: NaiveBayes: " + inputFilePath + " or "
+            + dictionaryFile + " not found!";
+        LOG.error(message);
+        throw new IllegalArgumentException(message);
+      }
+
+      loadWordlist();
+
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+    }
+    try {
+      train();
+    } catch (Exception e) {
+
+      LOG.error("Error occured while training:: "
+          + StringUtils.stringifyException(e));
+
+    }
+
+  }
+
+  /**
+   * Replaces the current word list with the non-blank lines of the word-list
+   * resource and always closes the reader.
+   */
+  private void loadWordlist() throws IOException {
+    Reader reader = conf.getConfResourceAsReader(dictionaryFile);
+    if (reader == null) {
+      // a missing resource would otherwise surface as a bare
+      // NullPointerException from the BufferedReader constructor
+      String message = "ParseFilter: NaiveBayes: " + dictionaryFile
+          + " not found!";
+      LOG.error(message);
+      throw new IllegalArgumentException(message);
+    }
+
+    // setConf may be called more than once; do not accumulate duplicates
+    wordlist.clear();
+
+    BufferedReader br = new BufferedReader(reader);
+    try {
+      String currentLine;
+      while ((currentLine = br.readLine()) != null) {
+        String word = currentLine.trim();
+        // an empty word is a substring of every URL and would mark every
+        // outlink as relevant, so blank lines are skipped
+        if (word.length() > 0) {
+          wordlist.add(word);
+        }
+      }
+    } finally {
+      br.close();
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Classifies the parsed page text. If the page is found irrelevant, the
+   * outlinks of its parse are replaced by only those whose target URL passes
+   * {@link #filterUrl(String)}; otherwise the parse is left untouched.
+   *
+   * @return the given parse result (its parse data may have been modified)
+   */
+  @Override
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    Parse parse = parseResult.get(content.getUrl());
+    if (parse == null) {
+      // no parse stored under this URL, so there is nothing to classify
+      return parseResult;
+    }
+
+    String url = content.getBaseUrl();
+    String text = parse.getText();
+
+    if (filterParse(text)) {
+      LOG.info("ParseFilter: NaiveBayes: Page found relevant:: " + url);
+      return parseResult;
+    }
+
+    // kick in the second tier if parent page found irrelevant
+    LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: " + url);
+    LOG.info("Checking outlinks");
+
+    Outlink[] outlinks = parse.getData().getOutlinks();
+    ArrayList<Outlink> keptOutlinks = new ArrayList<Outlink>();
+    for (Outlink outlink : outlinks) {
+      LOG.info("ParseFilter: NaiveBayes: Outlink to check:: "
+          + outlink.getToUrl());
+      if (filterUrl(outlink.getToUrl())) {
+        keptOutlinks.add(outlink);
+        LOG.info("ParseFilter: NaiveBayes: found relevant");
+      } else {
+        LOG.info("ParseFilter: NaiveBayes: found irrelevant");
+      }
+    }
+    parse.getData().setOutlinks(
+        keptOutlinks.toArray(new Outlink[keptOutlinks.size()]));
+
+    return parseResult;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java
 
b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java
new file mode 100644
index 0000000..19a6911
--- /dev/null
+++ 
b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parsefilter.naivebayes;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Builds the Naive Bayes model file ("naivebayes-model") from a training
+ * file in which every line has the form {@code <label>\t<text>}. Label "0"
+ * marks an irrelevant example; any other label is counted as relevant.
+ */
+public class Train {
+
+  /**
+   * Removes the first occurrence of a substring from a line.
+   *
+   * @param tomatch
+   *          substring to remove
+   * @param line
+   *          text to remove it from
+   * @return the line without the first occurrence of {@code tomatch}, or the
+   *         unchanged line if it does not occur
+   */
+  public static String replacefirstoccuranceof(String tomatch, String line) {
+
+    int index = line.indexOf(tomatch);
+    if (index == -1) {
+      return line;
+    }
+    return line.substring(0, index)
+        + line.substring(index + tomatch.length());
+
+  }
+
+  /**
+   * Increments the counter stored for a word, starting at 1 for a new word.
+   * Empty keys are ignored.
+   *
+   * @param dict
+   *          word to frequency map to update
+   * @param key
+   *          word to count
+   */
+  public static void updateHashMap(HashMap<String, Integer> dict, String key) {
+    if (key.equals("")) {
+      return;
+    }
+    Integer count = dict.get(key);
+    dict.put(key, count == null ? 1 : count + 1);
+  }
+
+  /**
+   * Serialises a word to frequency map as {@code word:count,word:count,...}.
+   *
+   * @param dict
+   *          map to serialise
+   * @return the flattened map; the empty string for an empty map (this
+   *         previously failed with a StringIndexOutOfBoundsException)
+   */
+  public static String flattenHashMap(HashMap<String, Integer> dict) {
+    StringBuilder result = new StringBuilder();
+
+    for (String key : dict.keySet()) {
+      // separator goes before every entry but the first, so there is no
+      // trailing comma to strip
+      if (result.length() > 0) {
+        result.append(',');
+      }
+      result.append(key).append(':').append(dict.get(key));
+    }
+
+    return result.toString();
+  }
+
+  /**
+   * Reads the training file and writes the model file "naivebayes-model"
+   * (overwriting an existing one) to the default file system.
+   *
+   * The model file contains, one value per line: the number of unique words,
+   * then for class "0" and for class "1" the class label, the number of
+   * examples, the number of words and the flattened word to frequency map.
+   *
+   * @param filepath
+   *          name of the training file, resolved as a configuration resource
+   * @throws IOException
+   *           if the training file cannot be located or read, or the model
+   *           file cannot be written
+   */
+  public static void start(String filepath) throws IOException {
+
+    // two classes 0/irrelevant and 1/relevant
+
+    // calculate the total number of instances/examples per class, word count
+    // in each class and for each class a word:frequency map
+
+    int numof_ir = 0;
+    int numof_r = 0;
+    int numwords_ir = 0;
+    int numwords_r = 0;
+    HashSet<String> uniquewords = new HashSet<String>();
+    HashMap<String, Integer> wordfreq_ir = new HashMap<String, Integer>();
+    HashMap<String, Integer> wordfreq_r = new HashMap<String, Integer>();
+
+    Configuration configuration = new Configuration();
+    FileSystem fs = FileSystem.get(configuration);
+
+    java.io.Reader resource = configuration.getConfResourceAsReader(filepath);
+    if (resource == null) {
+      // fail with a clear message instead of a NullPointerException from the
+      // BufferedReader constructor
+      throw new IOException("Training file not found: " + filepath);
+    }
+
+    BufferedReader bufferedReader = new BufferedReader(resource);
+    try {
+      String line;
+      while ((line = bufferedReader.readLine()) != null) {
+
+        // the label is everything before the first tab
+        String target = line.split("\t")[0];
+
+        line = replacefirstoccuranceof(target + "\t", line);
+
+        String[] linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase()
+            .split(" ");
+
+        // NOTE(review): empty tokens (e.g. from consecutive spaces) are
+        // included in the word totals and in the unique-word set, while
+        // updateHashMap skips them. Kept as is so the model numbers do not
+        // change; confirm whether that is intended.
+        HashMap<String, Integer> wordfreq;
+        if (target.equals("0")) {
+          numof_ir += 1;
+          numwords_ir += linearray.length;
+          wordfreq = wordfreq_ir;
+        } else {
+          numof_r += 1;
+          numwords_r += linearray.length;
+          wordfreq = wordfreq_r;
+        }
+
+        for (String word : linearray) {
+          uniquewords.add(word);
+          updateHashMap(wordfreq, word);
+        }
+
+      }
+    } finally {
+      bufferedReader.close();
+    }
+
+    // Serialise the model before the file is created, so a failure here
+    // cannot leave a truncated model file behind.
+    // NOTE(review): a class without any counted word is written as an empty
+    // line; confirm that the code reading the model accepts that.
+    StringBuilder model = new StringBuilder();
+    model.append(uniquewords.size()).append("\n");
+    model.append("0\n");
+    model.append(numof_ir).append("\n");
+    model.append(numwords_ir).append("\n");
+    model.append(flattenHashMap(wordfreq_ir)).append("\n");
+    model.append("1\n");
+    model.append(numof_r).append("\n");
+    model.append(numwords_r).append("\n");
+    model.append(flattenHashMap(wordfreq_r)).append("\n");
+
+    // write the model file
+
+    Path path = new Path("naivebayes-model");
+
+    Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(path,
+        true)));
+    try {
+      writer.write(model.toString());
+    } finally {
+      writer.close();
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
 
b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
new file mode 100644
index 0000000..6a892be
--- /dev/null
+++ 
b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Html Parse filter that classifies the outlinks from the parseresult as
+ * relevant or irrelevant based on the parseText's relevancy (using a training
+ * file where you can give positive and negative example texts see the
+ * description of parsefilter.naivebayes.trainfile) and if found irrelevant
+ * it gives the link a second chance if it contains any of the words from the
+ * list given in parsefilter.naivebayes.wordlist. CAUTION: Set the
+ * parser.timeout to -1 or a bigger value than 30, when using this classifier.
+ */
+package org.apache.nutch.parsefilter.naivebayes;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-regex/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/build.xml 
b/nutch-plugins/parsefilter-regex/build.xml
new file mode 100644
index 0000000..14d1127
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-regex" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-regex/data/regex-parsefilter.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/data/regex-parsefilter.txt 
b/nutch-plugins/parsefilter-regex/data/regex-parsefilter.txt
new file mode 100644
index 0000000..9d15cd8
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/data/regex-parsefilter.txt
@@ -0,0 +1,10 @@
+# Example configuration file for parsefilter-regex
+#
+# Parse metadata field <name> is set to true if the HTML matches the regex. The
+# source can either be html or text. If source is html, the regex is applied to
+# the entire HTML tree. If source is text, the regex is applied to the
+# extracted text.
+#
+# format: <name>\t<source>\t<regex>\n
+first  html    h1
+second text    blablabla

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-regex/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/ivy.xml 
b/nutch-plugins/parsefilter-regex/ivy.xml
new file mode 100644
index 0000000..ed4cbc3
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/ivy.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-regex/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/plugin.xml 
b/nutch-plugins/parsefilter-regex/plugin.xml
new file mode 100644
index 0000000..0725492
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parsefilter-regex"
+   name="Regex Parse Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parsefilter-regex.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.htmlparsefilter.regex"
+        name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="RegexParseFilter"
+                      class="org.apache.nutch.parsefilter.regex.RegexParseFilter">
+          <parameter name="file" value="regex-parsefilter.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parsefilter-regex/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/pom.xml 
b/nutch-plugins/parsefilter-regex/pom.xml
new file mode 100644
index 0000000..19b6452
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parsefilter-regex</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parsefilter-regex</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

[34/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

Reply via email to