[07/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

thammegowda Tue, 05 Jul 2016 15:49:27 -0700

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
deleted file mode 100644
index 576b3df..0000000
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.File;
-
-/**
- * Unit tests for MSWordParser.
- * 
- * @author John Xing
- */
-public class TestMSWordParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-msword/build.xml during plugin compilation.
-  // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
-  private String[] sampleFiles = { "word97.doc" };
-
-  private String expectedText = "This is a sample doc file prepared for nutch.";
-
-  private Configuration conf;
-
-  @Before
-  public void setUp() {
-    conf = NutchConfiguration.create();
-    conf.set("file.content.limit", "-1");
-  }
-
-  public String getTextContent(String fileName) throws ProtocolException,
-      ParseException {
-    String urlString = "file:" + sampleDir + fileSeparator + fileName;
-    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    Content content = protocol.getProtocolOutput(new Text(urlString),
-        new CrawlDatum()).getContent();
-    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-        .get(content.getUrl());
-    return parse.getText();
-  }
-
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    for (int i = 0; i < sampleFiles.length; i++) {
-      String found = getTextContent(sampleFiles[i]);
-      Assert.assertTrue("text found : '" + found + "'",
-          found.startsWith(expectedText));
-    }
-  }
-
-  @Test
-  public void testOpeningDocs() throws ProtocolException, ParseException {
-    String[] filenames = new File(sampleDir).list();
-    for (int i = 0; i < filenames.length; i++) {
-      if (filenames[i].endsWith(".doc") == false)
-        continue;
-      Assert.assertTrue("cann't read content of " + filenames[i],
-          getTextContent(filenames[i]).length() > 0);
-    }
-  }
-}


http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
deleted file mode 100644
index 6960bad..0000000
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import java.io.FileInputStream;
-import java.io.InputStreamReader;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.protocol.*;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Unit tests for OOParser.
- * 
- * @author Andrzej Bialecki
- */
-public class TestOOParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-oo/build.xml during plugin compilation.
-  private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
-
-  private String expectedText;
-
-  private String sampleText = "ootest.txt";
-
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Content content;
-    Parse parse;
-    Configuration conf = NutchConfiguration.create();
-    Protocol protocol;
-    ProtocolFactory factory = new ProtocolFactory(conf);
-
-    System.out.println("Expected : " + expectedText);
-
-    for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-      if (sampleFiles[i].startsWith("ootest") == false)
-        continue;
-
-      protocol = factory.getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-          .get(content.getUrl());
-
-      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
-
-      // simply test for the presence of a text - the ordering of the elements
-      // may differ from what was expected
-      // in the previous tests
-      Assert.assertTrue(text != null && text.length() > 0);
-
-      System.out.println("Found " + sampleFiles[i] + ": " + text);
-    }
-  }
-
-  public TestOOParser() {
-    try {
-      // read the test string
-      FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
-          + sampleText);
-      StringBuffer sb = new StringBuffer();
-      int len = 0;
-      InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
-      char[] buf = new char[1024];
-      while ((len = isr.read(buf)) > 0) {
-        sb.append(buf, 0, len);
-      }
-      isr.close();
-      expectedText = sb.toString();
-      // normalize space
-      expectedText = expectedText.replaceAll("[ \t\r\n]+", " ");
-    } catch (Exception e) {
-      e.printStackTrace();
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
deleted file mode 100644
index 9884f0c..0000000
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Unit tests for PdfParser.
- * 
- * @author John Xing
- */
-public class TestPdfParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-pdf/build.xml during plugin compilation.
-  // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
-  private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
-
-  private String expectedText = "A VERY SMALL PDF FILE";
-
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Protocol protocol;
-    Content content;
-    Parse parse;
-
-    for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-      Configuration conf = NutchConfiguration.create();
-      protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-          .get(content.getUrl());
-
-      int index = parse.getText().indexOf(expectedText);
-      Assert.assertTrue(index > 0);
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
deleted file mode 100644
index f15d821..0000000
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.tika;
-
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.DublinCore;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.junit.Assert;
-import org.junit.Ignore;
-import org.junit.Test;
-
-/**
- * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests).
- * 
- * @author Andy Hedges
- */
-public class TestRTFParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-rtf/build.xml during plugin compilation.
-  // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.
-  private String rtfFile = "test.rtf";
-
-  @Ignore("There seems to be an issue with line 71 e.g. text.trim()")
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-
-    String urlString;
-    Protocol protocol;
-    Content content;
-    Parse parse;
-
-    Configuration conf = NutchConfiguration.create();
-    urlString = "file:" + sampleDir + fileSeparator + rtfFile;
-    protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
-        .getContent();
-    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(
-        content.getUrl());
-    String text = parse.getText();
-    Assert.assertEquals("The quick brown fox jumps over the lazy dog",
-        text.trim());
-
-    String title = parse.getData().getTitle();
-    Metadata meta = parse.getData().getParseMeta();
-
-    Assert.assertEquals("test rft document", title);
-    Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));
-
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
deleted file mode 100644
index 4224f93..0000000
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
+++ /dev/null
@@ -1,156 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.tika.HTMLMetaProcessor;
-
-import java.io.ByteArrayInputStream;
-import java.net.URL;
-
-import org.xml.sax.*;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-import org.cyberneko.html.parsers.DOMFragmentParser;
-import org.junit.Assert;
-import org.junit.Test;
-
-/** Unit tests for HTMLMetaProcessor. */
-public class TestRobotsMetaProcessor {
-
-  /*
-   * 
-   * some sample tags:
-   * 
-   * <meta name="robots" content="index,follow"> <meta name="robots"
-   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
-   * <meta name="robots" content="noindex,nofollow">
-   * 
-   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
-   */
-
-  public static String[] tests = {
-      "<html><head><title>test page</title>"
-          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
-          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"all\"> "
-          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
-          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
-          + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"noindex,follow\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"index,nofollow\"> "
-          + "</head><body>" + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>"
-          + "<meta name=\"robots\" content=\"index,follow\"> "
-          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
-          + " some text" + "</body></html>",
-
-      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
-          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
-          + " some text" + "</body></html>",
-
-  };
-
-  public static final boolean[][] answers = { { true, true, true }, // NONE
-      { false, false, true }, // all
-      { true, true, true }, // nOnE
-      { true, true, false }, // none
-      { true, true, false }, // noindex,nofollow
-      { true, false, false }, // noindex,follow
-      { false, true, false }, // index,nofollow
-      { false, false, false }, // index,follow
-      { false, false, false }, // missing!
-  };
-
-  private URL[][] currURLsAndAnswers;
-
-  @Test
-  public void testRobotsMetaProcessor() {
-    DOMFragmentParser parser = new DOMFragmentParser();
-    ;
-
-    try {
-      currURLsAndAnswers = new URL[][] {
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org"), null },
-          { new URL("http://www.nutch.org/foo/"),
-              new URL("http://www.nutch.org/") },
-          { new URL("http://www.nutch.org"),
-              new URL("http://www.nutch.org/base/") } };
-    } catch (Exception e) {
-      Assert.assertTrue("couldn't make test URLs!", false);
-    }
-
-    for (int i = 0; i < tests.length; i++) {
-      byte[] bytes = tests[i].getBytes();
-
-      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
-
-      try {
-        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
-      } catch (Exception e) {
-        e.printStackTrace();
-      }
-
-      HTMLMetaTags robotsMeta = new HTMLMetaTags();
-      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
-
-      Assert.assertTrue("got index wrong on test " + i,
-          robotsMeta.getNoIndex() == answers[i][0]);
-      Assert.assertTrue("got follow wrong on test " + i,
-          robotsMeta.getNoFollow() == answers[i][1]);
-      Assert.assertTrue("got cache wrong on test " + i,
-          robotsMeta.getNoCache() == answers[i][2]);
-      Assert
-          .assertTrue(
-              "got base href wrong on test " + i + " (got "
-                  + robotsMeta.getBaseHref() + ")",
-              ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
-                  || ((robotsMeta.getBaseHref() != null) && robotsMeta
-                      .getBaseHref().equals(currURLsAndAnswers[i][1])));
-
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-zip/build.xml b/src/plugin/parse-zip/build.xml
deleted file mode 100644
index 991ce31..0000000
--- a/src/plugin/parse-zip/build.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-zip" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- Deploy Unit test dependencies -->
-  <target name="deps-test">
-    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
-    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
-   <!-- <ant target="deploy" inheritall="false" dir="../parse-text"/>-->
-  </target>
-
-
-  <!-- for junit test -->
-  <mkdir dir="${build.test}/data" />
-  <copy todir="${build.test}/data">
-    <fileset dir="sample">
-      <include name="*.zip" />
-    </fileset>
-  </copy>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-zip/ivy.xml b/src/plugin/parse-zip/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/parse-zip/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-zip/plugin.xml b/src/plugin/parse-zip/plugin.xml
deleted file mode 100644
index 35ec0eb..0000000
--- a/src/plugin/parse-zip/plugin.xml
+++ /dev/null
@@ -1,46 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="parse-zip"
-   name="Zip Parse Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="parse-zip.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.parse.zip"
-              name="ZipParser" 
-              point="org.apache.nutch.parse.Parser">
-
-      <implementation id="org.apache.nutch.parse.zip.ZipParser" 
-                      class="org.apache.nutch.parse.zip.ZipParser">
-        <parameter name="contentType" value="application/zip"/>
-        <parameter name="pathSuffix"  value="zip"/>
-      </implementation>
-      
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/sample/test.zip
----------------------------------------------------------------------
diff --git a/src/plugin/parse-zip/sample/test.zip b/src/plugin/parse-zip/sample/test.zip
deleted file mode 100644
index 0c649d2..0000000
Binary files a/src/plugin/parse-zip/sample/test.zip and /dev/null differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
deleted file mode 100644
index f441fd0..0000000
--- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.zip;
-
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * ZipParser class based on MSPowerPointParser class by Stephan Strittmatter.
- * Nutch parse plugin for zip files - Content Type : application/zip
- */
-public class ZipParser implements Parser {
-
-  private static final Logger LOG = LoggerFactory.getLogger(ZipParser.class);
-  private Configuration conf;
-
-  /** Creates a new instance of ZipParser */
-  public ZipParser() {
-  }
-
-  public ParseResult getParse(final Content content) {
-
-    String resultText = null;
-    String resultTitle = null;
-    Outlink[] outlinks = null;
-    List<Outlink> outLinksList = new ArrayList<Outlink>();
-
-    try {
-      final String contentLen = content.getMetadata().get(
-          Response.CONTENT_LENGTH);
-      final int len = Integer.parseInt(contentLen);
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("ziplen: " + len);
-      }
-      final byte[] contentInBytes = content.getContent();
-
-      if (contentLen != null && contentInBytes.length != len) {
-        return new ParseStatus(ParseStatus.FAILED,
-            ParseStatus.FAILED_TRUNCATED, "Content truncated at "
-                + contentInBytes.length
-                + " bytes. Parser can't handle incomplete zip file.")
-            .getEmptyParseResult(content.getUrl(), getConf());
-      }
-
-      ZipTextExtractor extractor = new ZipTextExtractor(getConf());
-
-      // extract text
-      resultText = extractor.extractText(new ByteArrayInputStream(
-          contentInBytes), content.getUrl(), outLinksList);
-
-    } catch (Exception e) {
-      return new ParseStatus(ParseStatus.FAILED,
-          "Can't be handled as Zip document. " + e).getEmptyParseResult(
-          content.getUrl(), getConf());
-    }
-
-    if (resultText == null) {
-      resultText = "";
-    }
-
-    if (resultTitle == null) {
-      resultTitle = "";
-    }
-
-    outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]);
-    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
-        resultTitle, outlinks, content.getMetadata());
-
-    if (LOG.isTraceEnabled()) {
-      LOG.trace("Zip file parsed sucessfully !!");
-    }
-    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(
-        resultText, parseData));
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  public static void main(String[] args) throws IOException {
-    if (args.length < 1) {
-      System.out.println("ZipParser <zip_file>");
-      System.exit(1);
-    }
-    File file = new File(args[0]);
-    String url = "file:"+file.getCanonicalPath();
-    FileInputStream in = new FileInputStream(file);
-    byte[] bytes = new byte[in.available()];
-    in.read(bytes);
-    in.close();
-    Configuration conf = NutchConfiguration.create();
-    ZipParser parser = new ZipParser();
-    parser.setConf(conf);
-    Metadata meta = new Metadata();
-    meta.add(Response.CONTENT_LENGTH, ""+file.length());
-    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
-        "application/zip", meta, conf));
-    Parse p = parseResult.get(url);
-    System.out.println(parseResult.size());
-    System.out.println("Parse Text:");
-    System.out.println(p.getText());
-    System.out.println("Parse Data:");
-    System.out.println(p.getData());
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
deleted file mode 100644
index b454727..0000000
--- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
+++ /dev/null
@@ -1,120 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.zip;
-
-// JDK imports
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URL;
-import java.util.List;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
-
-// Commons Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-
-// Nutch imports
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.protocol.Content;
-import org.apache.tika.Tika;
-
-/**
- * Extracts text and outlinks from the files stored in a ZIP archive by
- * passing every entry that has a file extension to the Nutch parser selected
- * for its detected content type.
- * 
- * @author Rohit Kulkarni & Ashish Vaidya
- */
-public class ZipTextExtractor {
-
-  public static final Logger LOG = LoggerFactory
-      .getLogger(ZipTextExtractor.class);
-
-  /** Size of the chunks used to drain one entry from the ZIP stream. */
-  private static final int BUFFER_SIZE = 4096;
-
-  private Configuration conf;
-
-  /** Creates a new instance of ZipTextExtractor */
-  public ZipTextExtractor(Configuration conf) {
-    this.conf = conf;
-  }
-
-  /**
-   * Parses every non-directory entry of the archive whose name contains a
-   * '.' and concatenates "<entry name> <parsed text> " for each of them.
-   *
-   * @param input
-   *          stream positioned at the start of the ZIP data; it is wrapped
-   *          but not closed here
-   * @param url
-   *          URL of the archive; entry URLs are built as url + "/" + name
-   * @param outLinksList
-   *          receives a copy of every outlink found in the parsed entries
-   * @return the concatenated text of all entries that could be parsed
-   * @throws IOException
-   *           if the archive cannot be read or an entry URL is malformed
-   */
-  public String extractText(InputStream input, String url,
-      List<Outlink> outLinksList) throws IOException {
-    StringBuilder resultText = new StringBuilder();
-    ZipInputStream zin = new ZipInputStream(input);
-    // Neither helper depends on the entry, so each is built at most once and
-    // only when the first parseable entry shows up.
-    Tika tika = null;
-    ParseUtil parseUtil = null;
-    ZipEntry entry;
-
-    while ((entry = zin.getNextEntry()) != null) {
-      if (entry.isDirectory()) {
-        continue;
-      }
-
-      // ZipEntry.getSize() is -1 when the size is not known up front, so the
-      // content is read until the end of the entry instead of into an array
-      // pre-sized from the header.
-      byte[] b = readEntry(zin);
-
-      String fname = entry.getName();
-      String newurl = url + "/" + fname;
-      String base = new URL(newurl).toString();
-      if (fname.lastIndexOf('.') == -1) {
-        // Entries without an extension are not parsed.
-        continue;
-      }
-
-      // Trying to resolve the Mime-Type
-      if (tika == null) {
-        tika = new Tika();
-      }
-      String contentType = tika.detect(fname);
-      try {
-        Metadata metadata = new Metadata();
-        // Report the number of bytes actually read rather than the header
-        // value, which may be unknown.
-        metadata.set(Response.CONTENT_LENGTH, Long.toString(b.length));
-        metadata.set(Response.CONTENT_TYPE, contentType);
-        Content content = new Content(newurl, base, b, contentType, metadata,
-            this.conf);
-        if (parseUtil == null) {
-          parseUtil = new ParseUtil(this.conf);
-        }
-        Parse parse = parseUtil.parse(content).get(content.getUrl());
-        ParseData theParseData = parse.getData();
-
-        for (Outlink outlink : theParseData.getOutlinks()) {
-          outLinksList.add(new Outlink(outlink.getToUrl(), outlink
-              .getAnchor()));
-        }
-
-        resultText.append(entry.getName()).append(" ")
-            .append(parse.getText()).append(" ");
-      } catch (ParseException e) {
-        // A single unparseable entry must not abort the whole archive.
-        if (LOG.isInfoEnabled()) {
-          LOG.info("fetch okay, but can't parse " + fname + ", reason: "
-              + e.getMessage());
-        }
-      }
-    }
-
-    return resultText.toString();
-  }
-
-  /** Reads the stream in chunks until read() reports -1 and returns the bytes. */
-  private static byte[] readEntry(InputStream in) throws IOException {
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    byte[] chunk = new byte[BUFFER_SIZE];
-    int count;
-    while ((count = in.read(chunk)) != -1) {
-      out.write(chunk, 0, count);
-    }
-    return out.toByteArray();
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java 
b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java
deleted file mode 100644
index fc81ee1..0000000
--- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parse ZIP files: embedded files are recursively passed to appropriate parsers.
- */
-package org.apache.nutch.parse.zip;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java 
b/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
deleted file mode 100644
index 17e386a..0000000
--- 
a/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.zip;
-
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Checks that the parse-zip plugin extracts the expected text from the
- * sample archive. Based on Unit tests for MSWordParser by John Xing
- * 
- * @author Rohit Kulkarni & Ashish Vaidya
- */
-public class TestZipParser {
-
-  private final String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private final String sampleDir = System.getProperty("test.data", ".");
-
-  // Make sure sample files are copied to "test.data"
-
-  private final String[] sampleFiles = { "test.zip" };
-
-  private final String expectedText = "textfile.txt This is text file number 1 ";
-
-  /**
-   * Fetches each sample file through its protocol plugin, parses it with the
-   * "parse-zip" extension and compares the extracted text.
-   */
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    Configuration conf = NutchConfiguration.create();
-
-    for (String sampleFile : sampleFiles) {
-      String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
-
-      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      Content content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      Parse parse = new ParseUtil(conf).parseByExtensionId("parse-zip",
-          content).get(content.getUrl());
-      // assertEquals reports both strings on failure, unlike
-      // assertTrue(a.equals(b)).
-      Assert.assertEquals(expectedText, parse.getText());
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/build-ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-naivebayes/build-ivy.xml 
b/src/plugin/parsefilter-naivebayes/build-ivy.xml
deleted file mode 100644
index 22bee5f..0000000
--- a/src/plugin/parsefilter-naivebayes/build-ivy.xml
+++ /dev/null
@@ -1,54 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parsefilter-naivebayes" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
-
-    <property name="ivy.install.version" value="2.1.0" />
-    <condition property="ivy.home" value="${env.IVY_HOME}">
-      <isset property="env.IVY_HOME" />
-    </condition>
-    <property name="ivy.home" value="${user.home}/.ant" />
-    <property name="ivy.checksums" value="" />
-    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
-    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
-
-    <target name="download-ivy" unless="offline">
-
-        <mkdir dir="${ivy.jar.dir}"/>
-        <!-- download Ivy from web site so that it can be used even without any special installation -->
-        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
-             dest="${ivy.jar.file}" usetimestamp="true"/>
-    </target>
-
-    <target name="init-ivy" depends="download-ivy">
-      <!-- try to load ivy here from ivy home, in case the user has not already dropped
-              it into ant's lib dir (note that the latter copy will always take precedence).
-              We will not fail as long as local lib dir exists (it may be empty) and
-              ivy is in at least one of ant's lib dir or the local lib dir. -->
-        <path id="ivy.lib.path">
-            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
-
-        </path>
-        <taskdef resource="org/apache/ivy/ant/antlib.xml"
-                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
-    </target>
-
-  <target name="deps-jar" depends="init-ivy">
-    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
-  </target>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-naivebayes/build.xml 
b/src/plugin/parsefilter-naivebayes/build.xml
deleted file mode 100644
index 6fb7a9d..0000000
--- a/src/plugin/parsefilter-naivebayes/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parsefilter-naivebayes" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-naivebayes/ivy.xml 
b/src/plugin/parsefilter-naivebayes/ivy.xml
deleted file mode 100644
index 08cca2c..0000000
--- a/src/plugin/parsefilter-naivebayes/ivy.xml
+++ /dev/null
@@ -1,49 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-
-    <dependency org="org.apache.mahout" name="mahout-math" rev="0.10.1" />
-    <dependency org="org.apache.mahout" name="mahout-core" rev="0.9" >
-      <exclude org="org.apache.mrunit" name="mrunit"/>
-    </dependency>
-    <dependency org="org.apache.lucene" name="lucene-core" rev="5.5.0" />
-    <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" />
-
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-naivebayes/plugin.xml 
b/src/plugin/parsefilter-naivebayes/plugin.xml
deleted file mode 100644
index ac15041..0000000
--- a/src/plugin/parsefilter-naivebayes/plugin.xml
+++ /dev/null
@@ -1,56 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="parsefilter-naivebayes"
-   name="Naive Bayes Parse Filter"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="parsefilter-naivebayes.jar">
-         <export name="*"/>
-      </library>
-      <library name="commons-cli-2.0-mahout.jar"/>
-      <library name="commons-lang3-3.1.jar"/>
-      <library name="commons-math3-3.2.jar"/>
-      <library name="guava-14.0.1.jar"/>
-      <library name="jackson-core-asl-1.9.12.jar"/>
-      <library name="jackson-mapper-asl-1.9.12.jar"/>
-      <library name="lucene-analyzers-common-5.5.0.jar"/>
-      <library name="lucene-core-5.5.0.jar"/>
-      <library name="mahout-core-0.9.jar"/>
-      <library name="mahout-math-0.10.1.jar"/>
-      <library name="slf4j-api-1.7.12.jar"/>
-      <library name="solr-commons-csv-3.5.0.jar"/>
-      <library name="t-digest-3.1.jar"/>
-      <library name="xmlpull-1.1.3.1.jar"/>
-      <library name="xpp3_min-1.1.4c.jar"/>
-      <library name="xstream-1.4.4.jar"/> 
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.htmlparsefilter.naivebayes"
-        name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
-      <implementation id="NaiveBayesHTMLParseFilter" 
-        class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/>
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
 
b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
deleted file mode 100644
index d755ff6..0000000
--- 
a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
+++ /dev/null
@@ -1,120 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parsefilter.naivebayes;
-
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.HashMap;
-import java.io.InputStreamReader;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-/**
- * Scores a text against the word statistics stored in the "naivebayes-model"
- * file and labels it "1" or "0". The model is read on the first call to
- * {@link #classify(String)} and kept in static fields afterwards.
- */
-public class Classify {
-
-  private static int uniquewords_size = 0;
-
-  private static int numof_ir = 0;
-  private static int numwords_ir = 0;
-  private static HashMap<String, Integer> wordfreq_ir = null;
-
-  private static int numof_r = 0;
-  private static int numwords_r = 0;
-  private static HashMap<String, Integer> wordfreq_r = null;
-  private static boolean ismodel = false;
-
-  /**
-   * Turns a line of the form "word:count,word:count,..." into a map.
-   *
-   * @param line
-   *          the flattened dictionary; an empty line yields an empty map
-   * @return a new map from each word to its count
-   */
-  public static HashMap<String, Integer> unflattenToHashmap(String line) {
-    HashMap<String, Integer> dict = new HashMap<String, Integer>();
-
-    for (String field : line.split(",")) {
-      if (field.isEmpty()) {
-        // "".split(",") yields one empty field; there is nothing to store.
-        continue;
-      }
-      String[] pair = field.split(":");
-      dict.put(pair[0], Integer.valueOf(pair[1]));
-    }
-
-    return dict;
-  }
-
-  /**
-   * Classifies a text with the stored model.
-   *
-   * @param line
-   *          the text; everything except ASCII letters and spaces is removed
-   *          and the rest is lower-cased before scoring
-   * @return "0" if the score of the "ir" statistics is strictly higher than
-   *         the score of the "r" statistics, "1" otherwise
-   * @throws IOException
-   *           if the model file cannot be read
-   */
-  public static String classify(String line) throws IOException {
-
-    String[] linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase()
-        .split(" ");
-
-    if (!ismodel) {
-      loadModel();
-    }
-
-    // The denominators do not depend on the word, so compute them once.
-    double logTotalIr = Math.log(numwords_ir + uniquewords_size);
-    double logTotalR = Math.log(numwords_r + uniquewords_size);
-
-    double prob_ir = 0;
-    double prob_r = 0;
-
-    // NOTE(review): the "+ 1" is added outside the logarithm; conventional
-    // add-one smoothing would be log(count + 1). Kept as is so results do not
-    // change - confirm the intent.
-    for (String word : linearray) {
-      Integer countIr = wordfreq_ir.get(word);
-      if (countIr != null)
-        prob_ir += Math.log(countIr) + 1 - logTotalIr;
-      else
-        prob_ir += 1 - logTotalIr;
-
-      Integer countR = wordfreq_r.get(word);
-      if (countR != null)
-        prob_r += Math.log(countR) + 1 - logTotalR;
-      else
-        prob_r += 1 - logTotalR;
-    }
-
-    // Add the log prior of each class.
-    prob_ir += Math.log(numof_ir) - Math.log(numof_ir + numof_r);
-    prob_r += Math.log(numof_r) - Math.log(numof_ir + numof_r);
-
-    return prob_ir > prob_r ? "0" : "1";
-  }
-
-  /**
-   * Reads the model file into the static fields. The reader is closed even
-   * when a line is missing or malformed; the model is only marked as loaded
-   * after every value was read.
-   */
-  private static void loadModel() throws IOException {
-    Configuration configuration = new Configuration();
-    FileSystem fs = FileSystem.get(configuration);
-
-    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(
-        fs.open(new Path("naivebayes-model"))));
-    try {
-      uniquewords_size = Integer.parseInt(bufferedReader.readLine());
-      bufferedReader.readLine(); // skipped line
-
-      numof_ir = Integer.parseInt(bufferedReader.readLine());
-      numwords_ir = Integer.parseInt(bufferedReader.readLine());
-      wordfreq_ir = unflattenToHashmap(bufferedReader.readLine());
-      bufferedReader.readLine(); // skipped line
-      numof_r = Integer.parseInt(bufferedReader.readLine());
-      numwords_r = Integer.parseInt(bufferedReader.readLine());
-      wordfreq_r = unflattenToHashmap(bufferedReader.readLine());
-
-      ismodel = true;
-    } finally {
-      bufferedReader.close();
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
 
b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
deleted file mode 100644
index 30810ae..0000000
--- 
a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
+++ /dev/null
@@ -1,197 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parsefilter.naivebayes;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.w3c.dom.DocumentFragment;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.protocol.Content;
-
-import java.io.Reader;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.util.ArrayList;
-
-/**
- * Html Parse filter that classifies the outlinks from the parseresult as
- * relevant or irrelevant based on the parseText's relevancy (using a training
- * file where you can give positive and negative example texts see the
- * description of parsefilter.naivebayes.trainfile) and if found irrelevant it
- * gives the link a second chance if it contains any of the words from the list
- * given in parsefilter.naivebayes.wordlist. CAUTION: Set the parser.timeout to
- * -1 or a bigger value than 30, when using this classifier.
- */
-public class NaiveBayesParseFilter implements HtmlParseFilter {
-
-  private static final Logger LOG = LoggerFactory
-      .getLogger(NaiveBayesParseFilter.class);
-
-  /** Configuration property naming the training file. */
-  public static final String TRAINFILE_MODELFILTER = "parsefilter.naivebayes.trainfile";
-  /** Configuration property naming the word-list file. */
-  public static final String DICTFILE_MODELFILTER = "parsefilter.naivebayes.wordlist";
-
-  private Configuration conf;
-  private String inputFilePath;
-  private String dictionaryFile;
-  private ArrayList<String> wordlist = new ArrayList<String>();
-
-  /**
-   * Classifies the text, treating a classification failure as "irrelevant".
-   *
-   * @return true if the text was classified as relevant; false if it was not
-   *         or if the classifier threw an IOException (which is logged)
-   */
-  public boolean filterParse(String text) {
-
-    try {
-      return classify(text);
-    } catch (IOException e) {
-      LOG.error("Error occured while classifying:: " + text + " ::"
-          + StringUtils.stringifyException(e));
-    }
-
-    return false;
-  }
-
-  /**
-   * @return true if the URL contains at least one word of the loaded word
-   *         list
-   */
-  public boolean filterUrl(String url) {
-
-    return containsWord(url, wordlist);
-
-  }
-
-  /**
-   * @return true if {@link Classify#classify(String)} labels the text "1"
-   * @throws IOException
-   *           if the classifier cannot read its model
-   */
-  public boolean classify(String text) throws IOException {
-
-    // if classified as relevant "1" then return true
-    return Classify.classify(text).equals("1");
-  }
-
-  /**
-   * Trains the model from the configured training file unless a file named
-   * "naivebayes-model" already exists.
-   */
-  public void train() throws Exception {
-    // check if the model file exists, if it does then don't train
-    if (!FileSystem.get(conf).exists(new Path("naivebayes-model"))) {
-      LOG.info("Training the Naive Bayes Model");
-      Train.start(inputFilePath);
-    } else {
-      LOG.info("Model file already exists. Skipping training.");
-    }
-  }
-
-  /**
-   * @return true if <code>url</code> contains any entry of
-   *         <code>wordlist</code> as a substring
-   */
-  public boolean containsWord(String url, ArrayList<String> wordlist) {
-    for (String word : wordlist) {
-      if (url.contains(word)) {
-        return true;
-      }
-    }
-
-    return false;
-  }
-
-  /**
-   * Stores the configuration, loads the word list and triggers training.
-   *
-   * @throws IllegalArgumentException
-   *           if one of the two properties is unset or blank, or if the word
-   *           list resource cannot be located
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    inputFilePath = conf.get(TRAINFILE_MODELFILTER);
-    dictionaryFile = conf.get(DICTFILE_MODELFILTER);
-    if (inputFilePath == null || inputFilePath.trim().length() == 0
-        || dictionaryFile == null || dictionaryFile.trim().length() == 0) {
-      // Build the message from the constants so the property names cannot be
-      // misspelled.
-      String message = "ParseFilter: NaiveBayes: trainfile or wordlist not set in the "
-          + TRAINFILE_MODELFILTER + " or " + DICTFILE_MODELFILTER;
-      if (LOG.isErrorEnabled()) {
-        LOG.error(message);
-      }
-      throw new IllegalArgumentException(message);
-    }
-    try {
-      // NOTE(review): this reports "not found" when one of the paths DOES
-      // exist on the file system, which looks inverted. The word list is read
-      // as a configuration resource below, so negating the test could reject
-      // setups that work today; left unchanged - confirm the intent.
-      if ((FileSystem.get(conf).exists(new Path(inputFilePath)))
-          || (FileSystem.get(conf).exists(new Path(dictionaryFile)))) {
-        String message = "ParseFilter: NaiveBayes: " + inputFilePath + " or "
-            + dictionaryFile + " not found!";
-        if (LOG.isErrorEnabled()) {
-          LOG.error(message);
-        }
-        throw new IllegalArgumentException(message);
-      }
-
-      Reader reader = conf.getConfResourceAsReader(dictionaryFile);
-      if (reader == null) {
-        // Fail with a clear message instead of a NullPointerException from
-        // the BufferedReader constructor.
-        String message = "ParseFilter: NaiveBayes: " + dictionaryFile
-            + " not found!";
-        if (LOG.isErrorEnabled()) {
-          LOG.error(message);
-        }
-        throw new IllegalArgumentException(message);
-      }
-
-      BufferedReader br = new BufferedReader(reader);
-      try {
-        String currentLine;
-        while ((currentLine = br.readLine()) != null) {
-          wordlist.add(currentLine);
-        }
-      } finally {
-        // The reader was previously left open.
-        br.close();
-      }
-
-    } catch (IOException e) {
-      LOG.error(StringUtils.stringifyException(e));
-    }
-    try {
-      train();
-    } catch (Exception e) {
-
-      LOG.error("Error occured while training:: "
-          + StringUtils.stringifyException(e));
-
-    }
-
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * Leaves the parse untouched when its text is classified as relevant;
-   * otherwise keeps only the outlinks whose target URL contains a word of the
-   * word list.
-   *
-   * @return the same <code>parseResult</code> instance that was passed in
-   */
-  @Override
-  public ParseResult filter(Content content, ParseResult parseResult,
-      HTMLMetaTags metaTags, DocumentFragment doc) {
-
-    Parse parse = parseResult.get(content.getUrl());
-    String url = content.getBaseUrl();
-
-    if (filterParse(parse.getText())) {
-      LOG.info("ParseFilter: NaiveBayes: Page found relevant:: " + url);
-      return parseResult;
-    }
-
-    // kick in the second tier
-    // if parent page found
-    // irrelevant
-    LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: " + url);
-    LOG.info("Checking outlinks");
-
-    // Fetch the outlinks once instead of on every loop step.
-    Outlink[] outlinks = parse.getData().getOutlinks();
-    ArrayList<Outlink> tempOutlinks = new ArrayList<Outlink>();
-    for (Outlink outlink : outlinks) {
-      LOG.info("ParseFilter: NaiveBayes: Outlink to check:: "
-          + outlink.getToUrl());
-      if (filterUrl(outlink.getToUrl())) {
-        tempOutlinks.add(outlink);
-        LOG.info("ParseFilter: NaiveBayes: found relevant");
-      } else {
-        LOG.info("ParseFilter: NaiveBayes: found irrelevant");
-      }
-    }
-    parse.getData().setOutlinks(
-        tempOutlinks.toArray(new Outlink[tempOutlinks.size()]));
-
-    return parseResult;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java
 
b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java
deleted file mode 100644
index 19a6911..0000000
--- 
a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java
+++ /dev/null
@@ -1,148 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parsefilter.naivebayes;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.HashMap;
-import java.util.HashSet;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-public class Train {
-
-  public static String replacefirstoccuranceof(String tomatch, String line) {
-
-    int index = line.indexOf(tomatch);
-    if (index == -1) {
-      return line;
-    } else {
-      return line.substring(0, index)
-          + line.substring(index + tomatch.length());
-    }
-
-  }
-
-  public static void updateHashMap(HashMap<String, Integer> dict, String key) {
-    if (!key.equals("")) {
-      if (dict.containsKey(key))
-        dict.put(key, dict.get(key) + 1);
-      else
-        dict.put(key, 1);
-    }
-  }
-
-  public static String flattenHashMap(HashMap<String, Integer> dict) {
-    String result = "";
-
-    for (String key : dict.keySet()) {
-
-      result += key + ":" + dict.get(key) + ",";
-    }
-
-    // remove the last comma
-    result = result.substring(0, result.length() - 1);
-
-    return result;
-  }
-
-  public static void start(String filepath) throws IOException {
-
-    // two classes 0/irrelevant and 1/relevant
-
-    // calculate the total number of instances/examples per class, word count in
-    // each class and for each class a word:frequency map
-
-    int numof_ir = 0;
-    int numof_r = 0;
-    int numwords_ir = 0;
-    int numwords_r = 0;
-    HashSet<String> uniquewords = new HashSet<String>();
-    HashMap<String, Integer> wordfreq_ir = new HashMap<String, Integer>();
-    HashMap<String, Integer> wordfreq_r = new HashMap<String, Integer>();
-
-    String line = "";
-    String target = "";
-    String[] linearray = null;
-
-    // read the line
-    Configuration configuration = new Configuration();
-    FileSystem fs = FileSystem.get(configuration);
-
-    BufferedReader bufferedReader = new BufferedReader(
-        configuration.getConfResourceAsReader(filepath));
-
-    while ((line = bufferedReader.readLine()) != null) {
-
-      target = line.split("\t")[0];
-
-      line = replacefirstoccuranceof(target + "\t", line);
-
-      linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase().split(" ");
-
-      // update the data structures
-      if (target.equals("0")) {
-
-        numof_ir += 1;
-        numwords_ir += linearray.length;
-        for (int i = 0; i < linearray.length; i++) {
-          uniquewords.add(linearray[i]);
-          updateHashMap(wordfreq_ir, linearray[i]);
-        }
-      } else {
-
-        numof_r += 1;
-        numwords_r += linearray.length;
-        for (int i = 0; i < linearray.length; i++) {
-          uniquewords.add(linearray[i]);
-          updateHashMap(wordfreq_r, linearray[i]);
-        }
-
-      }
-
-    }
-
-    // write the model file
-
-    Path path = new Path("naivebayes-model");
-
-    Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(path,
-        true)));
-
-    writer.write(String.valueOf(uniquewords.size()) + "\n");
-    writer.write("0\n");
-    writer.write(String.valueOf(numof_ir) + "\n");
-    writer.write(String.valueOf(numwords_ir) + "\n");
-    writer.write(flattenHashMap(wordfreq_ir) + "\n");
-    writer.write("1\n");
-    writer.write(String.valueOf(numof_r) + "\n");
-    writer.write(String.valueOf(numwords_r) + "\n");
-    writer.write(flattenHashMap(wordfreq_r) + "\n");
-
-    writer.close();
-
-    bufferedReader.close();
-
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
deleted file mode 100644
index 6a892be..0000000
--- a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Html Parse filter that classifies the outlinks from the parseresult as
- * relevant or irrelevant based on the parseText's relevancy (using a training
- * file where you can give positive and negative example texts see the
- * description of parsefilter.naivebayes.trainfile) and if found irrelevent
- * it gives the link a second chance if it contains any of the words from the
- * list given in parsefilter.naivebayes.wordlist. CAUTION: Set the
- * parser.timeout to -1 or a bigger value than 30, when using this classifier.
- */
-package org.apache.nutch.parsefilter.naivebayes;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-regex/build.xml b/src/plugin/parsefilter-regex/build.xml
deleted file mode 100644
index 14d1127..0000000
--- a/src/plugin/parsefilter-regex/build.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parsefilter-regex" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- for junit test -->
-  <mkdir dir="${build.test}/data"/>
-  <copy todir="${build.test}/data">
-    <fileset dir="data" />
-  </copy>
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/data/regex-parsefilter.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-regex/data/regex-parsefilter.txt b/src/plugin/parsefilter-regex/data/regex-parsefilter.txt
deleted file mode 100644
index 9d15cd8..0000000
--- a/src/plugin/parsefilter-regex/data/regex-parsefilter.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-# Example configuration file for parsefilter-regex
-#
-# Parse metadata field <name> is set to true if the HTML matches the regex. The
-# source can either be html or text. If source is html, the regex is applied to
-# the entire HTML tree. If source is text, the regex is applied to the
-# extracted text.
-#
-# format: <name>\t<source>\t<regex>\n
-first	html	h1
-second	text	blablabla

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-regex/ivy.xml b/src/plugin/parsefilter-regex/ivy.xml
deleted file mode 100644
index ed4cbc3..0000000
--- a/src/plugin/parsefilter-regex/ivy.xml
+++ /dev/null
@@ -1,37 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-regex/plugin.xml b/src/plugin/parsefilter-regex/plugin.xml
deleted file mode 100644
index 0725492..0000000
--- a/src/plugin/parsefilter-regex/plugin.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="parsefilter-regex"
-   name="Regex Parse Filter"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="parsefilter-regex.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.htmlparsefilter.regex"
-        name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
-      <implementation id="RegexParseFilter" 
-                      class="org.apache.nutch.parsefilter.regex.RegexParseFilter">
-          <parameter name="file" value="regex-parsefilter.txt"/>
-      </implementation>
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
deleted file mode 100644
index 0752c91..0000000
--- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
+++ /dev/null
@@ -1,199 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parsefilter.regex;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.FileReader;
-import java.io.Reader;
-import java.io.StringReader;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.plugin.Extension;
-import org.apache.nutch.plugin.PluginRepository;
-import org.apache.nutch.protocol.Content;
-
-import org.apache.commons.lang.StringUtils;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.w3c.dom.*;
-
-/**
- * RegexParseFilter. If a regular expression matches either HTML or 
- * extracted text, a configurable field is set to true.
- */
-public class RegexParseFilter implements HtmlParseFilter {
-  
-  private static final Logger LOG = LoggerFactory.getLogger(RegexParseFilter.class);
-  private static String attributeFile = null;
-  private String regexFile = null;
-  
-  private Configuration conf;
-  private DocumentFragment doc;
-  
-  private static final Map<String,RegexRule> rules = new HashMap<String,RegexRule>();
-  
-  public RegexParseFilter() {}
-  
-  public RegexParseFilter(String regexFile) {
-    this.regexFile = regexFile;
-  }
-
-  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
-    Parse parse = parseResult.get(content.getUrl());
-    String html = new String(content.getContent());
-    String text = parse.getText();
-    
-    for (Map.Entry<String, RegexRule> entry : rules.entrySet()) {
-      String field = entry.getKey();
-      RegexRule regexRule = entry.getValue();
-      
-      String source = null;
-      if (regexRule.source.equalsIgnoreCase("html")) {
-        source = html;
-      }
-      if (regexRule.source.equalsIgnoreCase("text")) {
-        source = text;
-      }
-      
-      if (source == null) {
-        LOG.error("source for regex rule: " + field + " misconfigured");
-      }
-      
-      if (matches(source, regexRule.regex)) {
-        parse.getData().getParseMeta().set(field, "true");
-      } else {
-        parse.getData().getParseMeta().set(field, "false");
-      }
-    }
-    
-    return parseResult;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-
-    // get the extensions for domain urlfilter
-    String pluginName = "parsefilter-regex";
-    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
-      HtmlParseFilter.class.getName()).getExtensions();
-    for (int i = 0; i < extensions.length; i++) {
-      Extension extension = extensions[i];
-      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
-        attributeFile = extension.getAttribute("file");
-        break;
-      }
-    }
-
-    // handle blank non empty input
-    if (attributeFile != null && attributeFile.trim().equals("")) {
-      attributeFile = null;
-    }
-
-    if (attributeFile != null) {
-      if (LOG.isInfoEnabled()) {
-        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
-          + " as " + attributeFile);
-      }
-    }
-    else {
-      if (LOG.isWarnEnabled()) {
-        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
-          + pluginName);
-      }
-    }
-
-    // domain file and attribute "file" take precedence if defined
-    String file = conf.get("parsefilter.regex.file");
-    String stringRules = conf.get("parsefilter.regex.rules");
-    if (regexFile != null) {
-      file = regexFile;
-    }
-    else if (attributeFile != null) {
-      file = attributeFile;
-    }
-    Reader reader = null;
-    if (stringRules != null) { // takes precedence over files
-      reader = new StringReader(stringRules);
-    } else {
-      reader = conf.getConfResourceAsReader(file);
-    }
-    try {
-      if (reader == null) {
-        reader = new FileReader(file);
-      }
-      readConfiguration(reader);
-    }
-    catch (IOException e) {
-      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
-    }
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-  
-  private boolean matches(String value, Pattern pattern) {
-    if (value != null) {
-      Matcher matcher = pattern.matcher(value);
-      return matcher.find();
-    }
-       
-    return false;
-  }
-  
-  private synchronized void readConfiguration(Reader configReader) throws IOException {
-    if (rules.size() > 0) {
-      return;
-    }
-
-    String line;
-    BufferedReader reader = new BufferedReader(configReader);
-    while ((line = reader.readLine()) != null) {
-      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
-        line = line.trim();
-        String[] parts = line.split("\t");
-
-        String field = parts[0].trim();
-        String source = parts[1].trim();
-        String regex = parts[2].trim();
-        
-        rules.put(field, new RegexRule(source, regex));
-      }
-    }
-  }
-  
-  private static class RegexRule {
-    public RegexRule(String source, String regex) {
-      this.source = source;
-      this.regex = Pattern.compile(regex);
-    }
-    String source;
-    Pattern regex;
-  }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java
deleted file mode 100644
index f8f46ee..0000000
--- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * RegexParseFilter. If a regular expression matches either HTML or 
- * extracted text, a configurable field is set to true.
- */
-package org.apache.nutch.parsefilter.regex;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java b/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
deleted file mode 100644
index 9bd7149..0000000
--- a/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parsefilter.regex;
-
-import java.net.MalformedURLException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConfiguration;
-import junit.framework.TestCase;
-
-public class TestRegexParseFilter extends TestCase {
-
-  private final static String SEPARATOR = System.getProperty("file.separator");
-  private final static String SAMPLES = System.getProperty("test.data", ".");
-
-  public void testPositiveFilter() throws Exception {
-    Configuration conf = NutchConfiguration.create();
-
-    String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
-    RegexParseFilter filter = new RegexParseFilter(file);
-    filter.setConf(conf);
-
-    String url = "http://nutch.apache.org/";
-    String html = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>";
-    Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
-    Parse parse = new ParseImpl("nutch this is the extracted text blablabla", new ParseData());
-    
-    ParseResult result = ParseResult.createParseResult(url, parse);
-    result = filter.filter(content, result, null, null);
-
-    Metadata meta = parse.getData().getParseMeta();
-    
-    assertEquals("true", meta.get("first"));
-    assertEquals("true", meta.get("second"));
-  }
-  
-  public void testNegativeFilter() throws Exception {
-    Configuration conf = NutchConfiguration.create();
-
-    String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
-    RegexParseFilter filter = new RegexParseFilter(file);
-    filter.setConf(conf);
-
-    String url = "http://nutch.apache.org/";
-    String html = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>";
-    Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
-    Parse parse = new ParseImpl("nutch this is the extracted text bla", new ParseData());
-    
-    ParseResult result = ParseResult.createParseResult(url, parse);
-    result = filter.filter(content, result, null, null);
-
-    Metadata meta = parse.getData().getParseMeta();
-    
-    assertEquals("false", meta.get("first"));
-    assertEquals("false", meta.get("second"));
-  }
-}
\ No newline at end of file

[07/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

Reply via email to