[66/69] [abbrv] nutch git commit: Moved test sources to maven standard directory

thammegowda Tue, 05 Jul 2016 15:50:01 -0700

Moved test sources to maven standard directory


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/20d28406
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/20d28406
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/20d28406

Branch: refs/heads/NUTCH-2292
Commit: 20d284068bdb918b0eea1a614644c5773bb42a12
Parents: ffa1678
Author: Thamme Gowda <[email protected]>
Authored: Tue Jul 5 15:21:52 2016 -0700
Committer: Thamme Gowda <[email protected]>
Committed: Tue Jul 5 15:21:52 2016 -0700

----------------------------------------------------------------------
 .../nutch/TestCCParseFilter.java                |  73 +++
 .../nutch/TestCCParseFilter.java                |  73 ---
 .../apache/nutch/parse/feed/TestFeedParser.java | 124 +++++
 .../apache/nutch/parse/feed/TestFeedParser.java | 124 -----
 .../anchor/TestAnchorIndexingFilter.java        |  67 +++
 .../anchor/TestAnchorIndexingFilter.java        |  67 ---
 .../indexer/basic/TestBasicIndexingFilter.java  |  99 ++++
 .../indexer/basic/TestBasicIndexingFilter.java  |  99 ----
 .../indexer/links/TestLinksIndexingFilter.java  | 218 +++++++++
 .../org/apache/nutch/parse/TestOutlinks.java    |  54 +++
 .../indexer/links/TestLinksIndexingFilter.java  | 218 ---------
 .../org/apache/nutch/parse/TestOutlinks.java    |  54 ---
 .../indexer/more/TestMoreIndexingFilter.java    | 123 +++++
 .../indexer/more/TestMoreIndexingFilter.java    | 123 -----
 .../nutch/indexer/replace/TestIndexReplace.java | 456 +++++++++++++++++++
 .../nutch/indexer/replace/TestIndexReplace.java | 456 -------------------
 .../staticfield/TestStaticFieldIndexerTest.java | 194 ++++++++
 .../staticfield/TestStaticFieldIndexerTest.java | 194 --------
 .../analysis/lang/TestHTMLLanguageParser.java   | 149 ++++++
 .../java/org/apache/nutch/analysis/lang/da.test | 108 +++++
 .../java/org/apache/nutch/analysis/lang/de.test | 104 +++++
 .../java/org/apache/nutch/analysis/lang/el.test | 109 +++++
 .../java/org/apache/nutch/analysis/lang/en.test | 105 +++++
 .../java/org/apache/nutch/analysis/lang/es.test | 107 +++++
 .../java/org/apache/nutch/analysis/lang/fi.test | 106 +++++
 .../java/org/apache/nutch/analysis/lang/fr.test | 105 +++++
 .../java/org/apache/nutch/analysis/lang/it.test | 109 +++++
 .../java/org/apache/nutch/analysis/lang/nl.test | 105 +++++
 .../java/org/apache/nutch/analysis/lang/pt.test | 105 +++++
 .../java/org/apache/nutch/analysis/lang/sv.test | 108 +++++
 .../nutch/analysis/lang/test-referencial.txt    |  10 +
 .../analysis/lang/TestHTMLLanguageParser.java   | 149 ------
 .../test/org/apache/nutch/analysis/lang/da.test | 108 -----
 .../test/org/apache/nutch/analysis/lang/de.test | 104 -----
 .../test/org/apache/nutch/analysis/lang/el.test | 109 -----
 .../test/org/apache/nutch/analysis/lang/en.test | 105 -----
 .../test/org/apache/nutch/analysis/lang/es.test | 107 -----
 .../test/org/apache/nutch/analysis/lang/fi.test | 106 -----
 .../test/org/apache/nutch/analysis/lang/fr.test | 105 -----
 .../test/org/apache/nutch/analysis/lang/it.test | 109 -----
 .../test/org/apache/nutch/analysis/lang/nl.test | 105 -----
 .../test/org/apache/nutch/analysis/lang/pt.test | 105 -----
 .../test/org/apache/nutch/analysis/lang/sv.test | 108 -----
 .../nutch/analysis/lang/test-referencial.txt    |  10 -
 .../protocol/http/api/TestRobotRulesParser.java | 123 +++++
 .../protocol/http/api/TestRobotRulesParser.java | 123 -----
 .../filter/MimeTypeIndexingFilterTest.java      | 114 +++++
 .../filter/MimeTypeIndexingFilterTest.java      | 114 -----
 .../apache/nutch/parse/ext/TestExtParser.java   | 130 ++++++
 .../apache/nutch/parse/ext/TestExtParser.java   | 130 ------
 .../nutch/parse/html/TestDOMContentUtils.java   | 347 ++++++++++++++
 .../apache/nutch/parse/html/TestHtmlParser.java | 122 +++++
 .../parse/html/TestRobotsMetaProcessor.java     | 155 +++++++
 .../nutch/parse/html/TestDOMContentUtils.java   | 347 --------------
 .../apache/nutch/parse/html/TestHtmlParser.java | 122 -----
 .../parse/html/TestRobotsMetaProcessor.java     | 155 -------
 .../nutch/parse/metatags/TestMetatagParser.java | 104 +++++
 .../nutch/parse/metatags/TestMetatagParser.java | 104 -----
 .../nutch/parse/replace/TestParseReplace.java   |  68 +++
 .../nutch/parse/replace/TestParseReplace.java   |  68 ---
 .../apache/nutch/parse/swf/TestSWFParser.java   |  94 ++++
 .../apache/nutch/parse/swf/TestSWFParser.java   |  94 ----
 .../apache/nutch/tika/TestDOMContentUtils.java  | 337 ++++++++++++++
 .../org/apache/nutch/tika/TestFeedParser.java   | 121 +++++
 .../apache/nutch/tika/TestImageMetadata.java    |  67 +++
 .../org/apache/nutch/tika/TestMSWordParser.java |  92 ++++
 .../org/apache/nutch/tika/TestOOParser.java     | 107 +++++
 .../org/apache/nutch/tika/TestPdfParser.java    |  73 +++
 .../org/apache/nutch/tika/TestRTFParser.java    |  81 ++++
 .../nutch/tika/TestRobotsMetaProcessor.java     | 156 +++++++
 .../apache/nutch/tika/TestDOMContentUtils.java  | 337 --------------
 .../org/apache/nutch/tika/TestFeedParser.java   | 121 -----
 .../apache/nutch/tika/TestImageMetadata.java    |  67 ---
 .../org/apache/nutch/tika/TestMSWordParser.java |  92 ----
 .../org/apache/nutch/tika/TestOOParser.java     | 107 -----
 .../org/apache/nutch/tika/TestPdfParser.java    |  73 ---
 .../org/apache/nutch/tika/TestRTFParser.java    |  81 ----
 .../nutch/tika/TestRobotsMetaProcessor.java     | 156 -------
 .../apache/nutch/parse/zip/TestZipParser.java   |  71 +++
 .../apache/nutch/parse/zip/TestZipParser.java   |  71 ---
 .../parsefilter/regex/TestRegexParseFilter.java |  77 ++++
 .../parsefilter/regex/TestRegexParseFilter.java |  77 ----
 .../nutch/protocol/file/TestProtocolFile.java   |  99 ++++
 .../nutch/protocol/file/TestProtocolFile.java   |  99 ----
 .../nutch/protocol/http/TestProtocolHttp.java   | 140 ++++++
 .../nutch/protocol/http/TestProtocolHttp.java   | 140 ------
 .../httpclient/TestProtocolHttpClient.java      | 217 +++++++++
 .../httpclient/TestProtocolHttpClient.java      | 217 ---------
 .../nutch/collection/TestSubcollection.java     | 112 +++++
 .../nutch/collection/TestSubcollection.java     | 112 -----
 .../automaton/TestAutomatonURLFilter.java       |  56 +++
 .../automaton/TestAutomatonURLFilter.java       |  56 ---
 .../urlfilter/domain/TestDomainURLFilter.java   |  67 +++
 .../urlfilter/domain/TestDomainURLFilter.java   |  67 ---
 .../TestDomainBlacklistURLFilter.java           |  49 ++
 .../TestDomainBlacklistURLFilter.java           |  49 --
 .../urlfilter/prefix/TestPrefixURLFilter.java   |  79 ++++
 .../urlfilter/prefix/TestPrefixURLFilter.java   |  79 ----
 .../urlfilter/regex/TestRegexURLFilter.java     |  61 +++
 .../urlfilter/regex/TestRegexURLFilter.java     |  61 ---
 .../urlfilter/suffix/TestSuffixURLFilter.java   | 123 +++++
 .../urlfilter/suffix/TestSuffixURLFilter.java   | 123 -----
 .../urlfilter/validator/TestUrlValidator.java   |  79 ++++
 .../urlfilter/validator/TestUrlValidator.java   |  79 ----
 .../ajax/TestAjaxURLNormalizer.java             |  67 +++
 .../ajax/TestAjaxURLNormalizer.java             |  67 ---
 .../basic/TestBasicURLNormalizer.java           | 175 +++++++
 .../basic/TestBasicURLNormalizer.java           | 175 -------
 .../host/TestHostURLNormalizer.java             |  57 +++
 .../host/TestHostURLNormalizer.java             |  57 ---
 .../pass/TestPassURLNormalizer.java             |  45 ++
 .../pass/TestPassURLNormalizer.java             |  45 --
 .../protocol/TestProtocolURLNormalizer.java     |  55 +++
 .../protocol/TestProtocolURLNormalizer.java     |  55 ---
 .../TestQuerystringURLNormalizer.java           |  49 ++
 .../TestQuerystringURLNormalizer.java           |  49 --
 .../regex/TestRegexURLNormalizer.java           | 186 ++++++++
 .../regex/TestRegexURLNormalizer.java           | 186 --------
 .../slash/TestSlashURLNormalizer.java           |  73 +++
 .../slash/TestSlashURLNormalizer.java           |  73 ---
 120 files changed, 6966 insertions(+), 6966 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java b/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java
new file mode 100755
index 0000000..41be9ed
--- /dev/null
+++ b/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.creativecommons.nutch;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.*;
+
+public class TestCCParseFilter {
+
+  private static final File testDir = new File(System.getProperty("test.input"));
+
+  @Test
+  public void testPages() throws Exception {
+    pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
+        "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
+    // Tika returns <a> whereas parse-html returns <rel>
+    // check later
+    pageTest(new File(testDir, "rel.html"), "http://foo.com/",
+        "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
+    // Tika returns <a> whereas parse-html returns <rdf>
+    // check later
+    pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
+        "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
+  }
+
+  public void pageTest(File file, String url, String license, String location,
+      String type) throws Exception {
+
+    String contentType = "text/html";
+    InputStream in = new FileInputStream(file);
+    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
+    byte[] buffer = new byte[1024];
+    int i;
+    while ((i = in.read(buffer)) != -1) {
+      out.write(buffer, 0, i);
+    }
+    in.close();
+    byte[] bytes = out.toByteArray();
+    Configuration conf = NutchConfiguration.create();
+
+    Content content = new Content(url, url, bytes, contentType, new Metadata(),
+        conf);
+    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+
+    Metadata metadata = parse.getData().getParseMeta();
+    Assert.assertEquals(license, metadata.get("License-Url"));
+    Assert.assertEquals(location, metadata.get("License-Location"));
+    Assert.assertEquals(type, metadata.get("Work-Type"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java b/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
deleted file mode 100755
index 41be9ed..0000000
--- a/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.creativecommons.nutch;
-
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-import java.io.*;
-
-public class TestCCParseFilter {
-
-  private static final File testDir = new File(System.getProperty("test.input"));
-
-  @Test
-  public void testPages() throws Exception {
-    pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
-        "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
-    // Tika returns <a> whereas parse-html returns <rel>
-    // check later
-    pageTest(new File(testDir, "rel.html"), "http://foo.com/",
-        "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
-    // Tika returns <a> whereas parse-html returns <rdf>
-    // check later
-    pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
-        "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
-  }
-
-  public void pageTest(File file, String url, String license, String location,
-      String type) throws Exception {
-
-    String contentType = "text/html";
-    InputStream in = new FileInputStream(file);
-    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
-    byte[] buffer = new byte[1024];
-    int i;
-    while ((i = in.read(buffer)) != -1) {
-      out.write(buffer, 0, i);
-    }
-    in.close();
-    byte[] bytes = out.toByteArray();
-    Configuration conf = NutchConfiguration.create();
-
-    Content content = new Content(url, url, bytes, contentType, new Metadata(),
-        conf);
-    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-
-    Metadata metadata = parse.getData().getParseMeta();
-    Assert.assertEquals(license, metadata.get("License-Url"));
-    Assert.assertEquals(location, metadata.get("License-Location"));
-    Assert.assertEquals(type, metadata.get("Work-Type"));
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java b/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java
new file mode 100644
index 0000000..36c8739
--- /dev/null
+++ b/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.feed;
+
+// JDK imports
+import java.util.Iterator;
+import java.util.Map;
+
+import org.junit.Assert;
+import org.junit.Test;
+// APACHE imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolNotFound;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * 
+ * @author mattmann
+ * 
+ *         Test Suite for the {@link FeedParser}.
+ * 
+ */
+public class TestFeedParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/feed/build.xml during plugin compilation.
+
+  private String[] sampleFiles = { "rsstest.rss" };
+
+  public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
+      .getName());
+
+  /**
+   * Calls the {@link FeedParser} on a sample RSS file and checks that there are
+   * 3 {@link ParseResult} entries including the below 2 links:
+   * <ul>
+   * <li>http://www-scf.usc.edu/~mattmann/</li>
+   * <li>http://www.nutch.org</li>
+   * </ul>
+   * 
+   * 
+   * @throws ProtocolNotFound
+   *           If the {@link Protocol}Layer cannot be loaded (required to fetch
+   *           the {@link Content} for the RSS file).
+   * @throws ParseException
+   *           If the {@link Parser}Layer cannot be loaded.
+   */
+  @Test
+  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    ParseResult parseResult;
+
+    Configuration conf = NutchConfiguration.create();
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+      urlString = urlString.replace('\\', '/');
+
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+
+      parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);
+
+      Assert.assertEquals(3, parseResult.size());
+
+      boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
+
+      for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j
+          .hasNext();) {
+        Map.Entry<Text, Parse> entry = j.next();
+        if (entry.getKey().toString()
+            .equals("http://www-scf.usc.edu/~mattmann/")) {
+          hasLink1 = true;
+        } else if (entry.getKey().toString().equals("http://www.nutch.org/")) {
+          hasLink2 = true;
+        } else if (entry.getKey().toString().equals(urlString)) {
+          hasLink3 = true;
+        }
+
+        Assert.assertNotNull(entry.getValue());
+        Assert.assertNotNull(entry.getValue().getData());
+      }
+
+      if (!hasLink1 || !hasLink2 || !hasLink3) {
+        Assert.fail("Outlinks read from sample rss file are not correct!");
+      }
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java b/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
deleted file mode 100644
index 36c8739..0000000
--- a/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.feed;
-
-// JDK imports
-import java.util.Iterator;
-import java.util.Map;
-
-import org.junit.Assert;
-import org.junit.Test;
-// APACHE imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.ProtocolNotFound;
-import org.apache.nutch.util.NutchConfiguration;
-
-/**
- * 
- * @author mattmann
- * 
- *         Test Suite for the {@link FeedParser}.
- * 
- */
-public class TestFeedParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/feed/build.xml during plugin compilation.
-
-  private String[] sampleFiles = { "rsstest.rss" };
-
-  public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
-      .getName());
-
-  /**
-   * Calls the {@link FeedParser} on a sample RSS file and checks that there are
-   * 3 {@link ParseResult} entries including the below 2 links:
-   * <ul>
-   * <li>http://www-scf.usc.edu/~mattmann/</li>
-   * <li>http://www.nutch.org</li>
-   * </ul>
-   * 
-   * 
-   * @throws ProtocolNotFound
-   *           If the {@link Protocol}Layer cannot be loaded (required to fetch
-   *           the {@link Content} for the RSS file).
-   * @throws ParseException
-   *           If the {@link Parser}Layer cannot be loaded.
-   */
-  @Test
-  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
-    String urlString;
-    Protocol protocol;
-    Content content;
-    ParseResult parseResult;
-
-    Configuration conf = NutchConfiguration.create();
-    for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-      urlString = urlString.replace('\\', '/');
-
-      protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-
-      parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);
-
-      Assert.assertEquals(3, parseResult.size());
-
-      boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
-
-      for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j
-          .hasNext();) {
-        Map.Entry<Text, Parse> entry = j.next();
-        if (entry.getKey().toString()
-            .equals("http://www-scf.usc.edu/~mattmann/")) {
-          hasLink1 = true;
-        } else if (entry.getKey().toString().equals("http://www.nutch.org/")) {
-          hasLink2 = true;
-        } else if (entry.getKey().toString().equals(urlString)) {
-          hasLink3 = true;
-        }
-
-        Assert.assertNotNull(entry.getValue());
-        Assert.assertNotNull(entry.getValue().getData());
-      }
-
-      if (!hasLink1 || !hasLink2 || !hasLink3) {
-        Assert.fail("Outlinks read from sample rss file are not correct!");
-      }
-    }
-
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java b/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
new file mode 100644
index 0000000..08a42f3
--- /dev/null
+++ b/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.anchor;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit test case which tests 1. that anchor text is obtained 2. that anchor
+ * deduplication functionality is working
+ * 
+ * @author lewismc
+ * 
+ */
+public class TestAnchorIndexingFilter {
+
+  @Test
+  public void testDeduplicateAnchor() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
+    AnchorIndexingFilter filter = new AnchorIndexingFilter();
+    filter.setConf(conf);
+    Assert.assertNotNull(filter);
+    NutchDocument doc = new NutchDocument();
+    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://test1.com/", "text1"));
+    inlinks.add(new Inlink("http://test2.com/", "text2"));
+    inlinks.add(new Inlink("http://test3.com/", "text2"));
+    try {
+      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+          new CrawlDatum(), inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.getMessage());
+    }
+    Assert.assertNotNull(doc);
+    Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames()
+        .contains("anchor"));
+    Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor")
+        .getValues().size());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java b/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
deleted file mode 100644
index 08a42f3..0000000
--- a/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.anchor;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlink;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * JUnit test case which tests 1. that anchor text is obtained 2. that anchor
- * deduplication functionality is working
- * 
- * @author lewismc
- * 
- */
-public class TestAnchorIndexingFilter {
-
-  @Test
-  public void testDeduplicateAnchor() throws Exception {
-    Configuration conf = NutchConfiguration.create();
-    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
-    AnchorIndexingFilter filter = new AnchorIndexingFilter();
-    filter.setConf(conf);
-    Assert.assertNotNull(filter);
-    NutchDocument doc = new NutchDocument();
-    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
-    Inlinks inlinks = new Inlinks();
-    inlinks.add(new Inlink("http://test1.com/", "text1"));
-    inlinks.add(new Inlink("http://test2.com/", "text2"));
-    inlinks.add(new Inlink("http://test3.com/", "text2"));
-    try {
-      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
-          new CrawlDatum(), inlinks);
-    } catch (Exception e) {
-      e.printStackTrace();
-      Assert.fail(e.getMessage());
-    }
-    Assert.assertNotNull(doc);
-    Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames()
-        .contains("anchor"));
-    Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor")
-        .getValues().size());
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java b/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
new file mode 100644
index 0000000..4bc317e
--- /dev/null
+++ b/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.basic;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.basic.BasicIndexingFilter;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.Date;
+
+/**
+ * JUnit test case which tests 1. that basic searchable fields are added to a
+ * document 2. that domain is added as per {@code indexer.add.domain} in
+ * nutch-default.xml. 3. that title is truncated as per
+ * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is
+ * truncated as per {@code indexer.max.content.length} in nutch-default.xml.
+ * 
+ * @author tejasp
+ * 
+ */
+
+public class TestBasicIndexingFilter {
+
+  @Test
+  public void testBasicIndexingFilter() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.setInt("indexer.max.title.length", 10);
+    conf.setBoolean("indexer.add.domain", true);
+    conf.setInt("indexer.max.content.length", 20);
+
+    BasicIndexingFilter filter = new BasicIndexingFilter();
+    filter.setConf(conf);
+    Assert.assertNotNull(filter);
+
+    NutchDocument doc = new NutchDocument();
+
+    String title = "The Foo Page";
+    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
+    Metadata metaData = new Metadata();
+    metaData.add("Language", "en/us");
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+        outlinks, metaData);
+    ParseImpl parse = new ParseImpl(
+        "this is a sample foo bar page. hope you enjoy it.", parseData);
+
+    CrawlDatum crawlDatum = new CrawlDatum();
+    crawlDatum.setFetchTime(100L);
+
+    Inlinks inlinks = new Inlinks();
+
+    try {
+      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+          crawlDatum, inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.getMessage());
+    }
+    Assert.assertNotNull(doc);
+    Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc
+        .getField("title").getValues().get(0));
+    Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc
+        .getField("domain").getValues().get(0));
+    Assert.assertEquals("test host, expect \"nutch.apache.org\"",
+        "nutch.apache.org", doc.getField("host").getValues().get(0));
+    Assert.assertEquals(
+        "test url, expect \"http://nutch.apache.org/index.html\"",
+        "http://nutch.apache.org/index.html", doc.getField("url").getValues()
+            .get(0));
+    Assert.assertEquals("test content", "this is a sample foo",
+        doc.getField("content").getValues().get(0));
+    Assert.assertEquals("test fetch time", new Date(100L),
+        (Date) doc.getField("tstamp").getValues().get(0));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java b/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
deleted file mode 100644
index 4bc317e..0000000
--- a/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.basic;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.basic.BasicIndexingFilter;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-import java.util.Date;
-
-/**
- * JUnit test case which tests 1. that basic searchable fields are added to a
- * document 2. that domain is added as per {@code indexer.add.domain} in
- * nutch-default.xml. 3. that title is truncated as per
- * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is
- * truncated as per {@code indexer.max.content.length} in nutch-default.xml.
- * 
- * @author tejasp
- * 
- */
-
-public class TestBasicIndexingFilter {
-
-  @Test
-  public void testBasicIndexingFilter() throws Exception {
-    Configuration conf = NutchConfiguration.create();
-    conf.setInt("indexer.max.title.length", 10);
-    conf.setBoolean("indexer.add.domain", true);
-    conf.setInt("indexer.max.content.length", 20);
-
-    BasicIndexingFilter filter = new BasicIndexingFilter();
-    filter.setConf(conf);
-    Assert.assertNotNull(filter);
-
-    NutchDocument doc = new NutchDocument();
-
-    String title = "The Foo Page";
-    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
-    Metadata metaData = new Metadata();
-    metaData.add("Language", "en/us");
-    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
-        outlinks, metaData);
-    ParseImpl parse = new ParseImpl(
-        "this is a sample foo bar page. hope you enjoy it.", parseData);
-
-    CrawlDatum crawlDatum = new CrawlDatum();
-    crawlDatum.setFetchTime(100L);
-
-    Inlinks inlinks = new Inlinks();
-
-    try {
-      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
-          crawlDatum, inlinks);
-    } catch (Exception e) {
-      e.printStackTrace();
-      Assert.fail(e.getMessage());
-    }
-    Assert.assertNotNull(doc);
-    Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc
-        .getField("title").getValues().get(0));
-    Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc
-        .getField("domain").getValues().get(0));
-    Assert.assertEquals("test host, expect \"nutch.apache.org\"",
-        "nutch.apache.org", doc.getField("host").getValues().get(0));
-    Assert.assertEquals(
-        "test url, expect \"http://nutch.apache.org/index.html\"",
-        "http://nutch.apache.org/index.html", doc.getField("url").getValues()
-            .get(0));
-    Assert.assertEquals("test content", "this is a sample foo",
-        doc.getField("content").getValues().get(0));
-    Assert.assertEquals("test fetch time", new Date(100L),
-        (Date) doc.getField("tstamp").getValues().get(0));
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java b/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
new file mode 100644
index 0000000..c490d1f
--- /dev/null
+++ b/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.links;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.net.URL;
+import java.util.Iterator;
+
+public class TestLinksIndexingFilter {
+
+  Configuration conf = NutchConfiguration.create();
+  LinksIndexingFilter filter = new LinksIndexingFilter();
+  Metadata metadata = new Metadata();
+
+  @Before
+  public void setUp() throws Exception {
+    metadata.add(Response.CONTENT_TYPE, "text/html");
+  }
+
+  private Outlink[] generateOutlinks() throws Exception {
+    return generateOutlinks(false);
+  }
+
+  private Outlink[] generateOutlinks(boolean parts) throws Exception {
+    Outlink[] outlinks = new Outlink[2];
+
+    outlinks[0] = new Outlink("http://www.test.com", "test");
+    outlinks[1] = new Outlink("http://www.example.com", "example");
+
+    if (parts) {
+      outlinks[0] = new Outlink(outlinks[0].getToUrl() + "/index.php?param=1",
+          "test");
+      outlinks[1] = new Outlink(outlinks[1].getToUrl() + "/index.php?param=2",
+          "test");
+    }
+
+    return outlinks;
+  }
+
+  @Test
+  public void testFilterOutlinks() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+    filter.setConf(conf);
+
+    Outlink[] outlinks = generateOutlinks();
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
+
+    Assert.assertEquals("Filter outlinks, allow only those from a different host",
+        outlinks[0].getToUrl(), doc.getFieldValue("outlinks"));
+  }
+
+  @Test
+  public void testFilterInlinks() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+    filter.setConf(conf);
+
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://www.test.com", "test"));
+    inlinks.add(new Inlink("http://www.example.com", "example"));
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
+
+    Assert.assertEquals("Filter inlinks, allow only those from a different host",
+        "http://www.test.com", doc.getFieldValue("inlinks"));
+  }
+
+  @Test
+  public void testNoFilterOutlinks() throws Exception {
+    filter.setConf(conf);
+
+    Outlink[] outlinks = generateOutlinks();
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+    Assert.assertEquals("All outlinks must be indexed even those from the same host",
+        outlinks.length, doc.getField("outlinks").getValues().size());
+  }
+
+  @Test
+  public void testNoFilterInlinks() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
+    filter.setConf(conf);
+
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://www.test.com", "test"));
+    inlinks.add(new Inlink("http://www.example.com", "example"));
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+    Assert.assertEquals("All inlinks must be indexed even those from the same host",
+        inlinks.size(), doc.getField("inlinks").getValues().size());
+  }
+
+  @Test
+  public void testIndexOnlyHostPart() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+    filter.setConf(conf);
+
+    Outlink[] outlinks = generateOutlinks(true);
+
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
+    inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
+    inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example",
+        "example"));
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+    NutchField docOutlinks = doc.getField("outlinks");
+
+    Assert.assertEquals("Only the host portion of the outlink URL must be indexed",
+        new URL("http://www.test.com").getHost(),
+        docOutlinks.getValues().get(0));
+
+    Assert.assertEquals(
+        "The inlinks coming from the same host must count only once", 1,
+        doc.getField("inlinks").getValues().size());
+
+    Assert.assertEquals("Only the host portion of the inlinks URL must be indexed",
+        new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks"));
+  }
+
+  @Test
+  public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+
+    Outlink[] outlinks = generateOutlinks(true);
+
+    filter.setConf(conf);
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
+
+    Assert.assertEquals(
+        "Index only the host portion of the outlinks after filtering",
+        new URL("http://www.test.com").getHost(),
+        doc.getFieldValue("outlinks"));
+  }
+
+  @Test
+  public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+
+    filter.setConf(conf);
+
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://www.test.com", "test"));
+    inlinks.add(new Inlink("http://www.example.com", "example"));
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
+
+    Assert.assertEquals(
+        "Index only the host portion of the inlinks after filtering",
+        new URL("http://www.test.com").getHost(),
+        doc.getFieldValue("inlinks"));
+
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java b/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java
new file mode 100644
index 0000000..aaaedbf
--- /dev/null
+++ b/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.junit.Test;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.*;
+
+public class TestOutlinks {
+
+  @Test
+  public void testAddSameObject() throws Exception {
+    Set<Outlink> set = new HashSet<>();
+
+    Outlink o = new Outlink("http://www.example.com", "Example");
+    set.add(o);
+    set.add(o);
+
+    assertEquals("Adding the same Outlink twice", 1, set.size());
+  }
+
+  @Test
+  public void testAddOtherObjectWithSameData() throws Exception {
+    Set<Outlink> set = new HashSet<>();
+
+    Outlink o = new Outlink("http://www.example.com", "Example");
+    Outlink o1 = new Outlink("http://www.example.com", "Example");
+
+    assertTrue("The two Outlink objects are the same", o.equals(o1));
+
+    set.add(o);
+    set.add(o1);
+
+    assertEquals("The set should contain only 1 Outlink", 1, set.size());
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java b/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
deleted file mode 100644
index c490d1f..0000000
--- a/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
+++ /dev/null
@@ -1,218 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.links;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlink;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.NutchField;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.net.URL;
-import java.util.Iterator;
-
-public class TestLinksIndexingFilter {
-
-  Configuration conf = NutchConfiguration.create();
-  LinksIndexingFilter filter = new LinksIndexingFilter();
-  Metadata metadata = new Metadata();
-
-  @Before
-  public void setUp() throws Exception {
-    metadata.add(Response.CONTENT_TYPE, "text/html");
-  }
-
-  private Outlink[] generateOutlinks() throws Exception {
-    return generateOutlinks(false);
-  }
-
-  private Outlink[] generateOutlinks(boolean parts) throws Exception {
-    Outlink[] outlinks = new Outlink[2];
-
-    outlinks[0] = new Outlink("http://www.test.com", "test");
-    outlinks[1] = new Outlink("http://www.example.com", "example");
-
-    if (parts) {
-      outlinks[0] = new Outlink(outlinks[0].getToUrl() + "/index.php?param=1",
-          "test");
-      outlinks[1] = new Outlink(outlinks[1].getToUrl() + "/index.php?param=2",
-          "test");
-    }
-
-    return outlinks;
-  }
-
-  @Test
-  public void testFilterOutlinks() throws Exception {
-    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
-    filter.setConf(conf);
-
-    Outlink[] outlinks = generateOutlinks();
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
-    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
-
-    Assert.assertEquals("Filter outlinks, allow only those from a different host",
-        outlinks[0].getToUrl(), doc.getFieldValue("outlinks"));
-  }
-
-  @Test
-  public void testFilterInlinks() throws Exception {
-    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
-    filter.setConf(conf);
-
-    Inlinks inlinks = new Inlinks();
-    inlinks.add(new Inlink("http://www.test.com", "test"));
-    inlinks.add(new Inlink("http://www.example.com", "example"));
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
-
-    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
-
-    Assert.assertEquals("Filter inlinks, allow only those from a different host",
-        "http://www.test.com", doc.getFieldValue("inlinks"));
-  }
-
-  @Test
-  public void testNoFilterOutlinks() throws Exception {
-    filter.setConf(conf);
-
-    Outlink[] outlinks = generateOutlinks();
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
-    Assert.assertEquals("All outlinks must be indexed even those from the same host",
-        outlinks.length, doc.getField("outlinks").getValues().size());
-  }
-
-  @Test
-  public void testNoFilterInlinks() throws Exception {
-    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
-    filter.setConf(conf);
-
-    Inlinks inlinks = new Inlinks();
-    inlinks.add(new Inlink("http://www.test.com", "test"));
-    inlinks.add(new Inlink("http://www.example.com", "example"));
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
-
-    Assert.assertEquals("All inlinks must be indexed even those from the same host",
-        inlinks.size(), doc.getField("inlinks").getValues().size());
-  }
-
-  @Test
-  public void testIndexOnlyHostPart() throws Exception {
-    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
-    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
-    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
-    filter.setConf(conf);
-
-    Outlink[] outlinks = generateOutlinks(true);
-
-    Inlinks inlinks = new Inlinks();
-    inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
-    inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
-    inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example",
-        "example"));
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
-
-    NutchField docOutlinks = doc.getField("outlinks");
-
-    Assert.assertEquals("Only the host portion of the outlink URL must be indexed",
-        new URL("http://www.test.com").getHost(),
-        docOutlinks.getValues().get(0));
-
-    Assert.assertEquals(
-        "The inlinks coming from the same host must count only once", 1,
-        doc.getField("inlinks").getValues().size());
-
-    Assert.assertEquals("Only the host portion of the inlinks URL must be indexed",
-        new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks"));
-  }
-
-  @Test
-  public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
-    conf = NutchConfiguration.create();
-    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
-    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
-
-    Outlink[] outlinks = generateOutlinks(true);
-
-    filter.setConf(conf);
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
-    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
-
-    Assert.assertEquals(
-        "Index only the host portion of the outlinks after filtering",
-        new URL("http://www.test.com").getHost(),
-        doc.getFieldValue("outlinks"));
-  }
-
-  @Test
-  public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
-    conf = NutchConfiguration.create();
-    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
-    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
-
-    filter.setConf(conf);
-
-    Inlinks inlinks = new Inlinks();
-    inlinks.add(new Inlink("http://www.test.com", "test"));
-    inlinks.add(new Inlink("http://www.example.com", "example"));
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
-
-    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
-
-    Assert.assertEquals(
-        "Index only the host portion of the inlinks after filtering",
-        new URL("http://www.test.com").getHost(),
-        doc.getFieldValue("inlinks"));
-
-  }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java b/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
deleted file mode 100644
index aaaedbf..0000000
--- a/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse;
-
-import org.junit.Test;
-
-import java.util.HashSet;
-import java.util.Set;
-
-import static org.junit.Assert.*;
-
-public class TestOutlinks {
-
-  @Test
-  public void testAddSameObject() throws Exception {
-    Set<Outlink> set = new HashSet<>();
-
-    Outlink o = new Outlink("http://www.example.com", "Example");
-    set.add(o);
-    set.add(o);
-
-    assertEquals("Adding the same Outlink twice", 1, set.size());
-  }
-
-  @Test
-  public void testAddOtherObjectWithSameData() throws Exception {
-    Set<Outlink> set = new HashSet<>();
-
-    Outlink o = new Outlink("http://www.example.com", "Example");
-    Outlink o1 = new Outlink("http://www.example.com", "Example");
-
-    assertTrue("The two Outlink objects are the same", o.equals(o1));
-
-    set.add(o);
-    set.add(o1);
-
-    assertEquals("The set should contain only 1 Outlink", 1, set.size());
-  }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
new file mode 100644
index 0000000..f918dde
--- /dev/null
+++ b/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.more;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestMoreIndexingFilter {
+
+  @Test
+  public void testContentType() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+    assertContentType(conf, "text/html", "text/html");
+    assertContentType(conf, "text/html; charset=UTF-8", "text/html");
+  }
+
+  @Test
+  public void testGetParts() {
+    String[] parts = MoreIndexingFilter.getParts("text/html");
+    assertParts(parts, 2, "text", "html");
+  }
+
+  /**
+   * @since NUTCH-901
+   */
+  @Test
+  public void testNoParts() {
+    Configuration conf = NutchConfiguration.create();
+    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
+    MoreIndexingFilter filter = new MoreIndexingFilter();
+    filter.setConf(conf);
+    Assert.assertNotNull(filter);
+    NutchDocument doc = new NutchDocument();
+    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
+
+    try {
+      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+          new CrawlDatum(), new Inlinks());
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.getMessage());
+    }
+    Assert.assertNotNull(doc);
+    Assert.assertTrue(doc.getFieldNames().contains("type"));
+    Assert.assertEquals(1, doc.getField("type").getValues().size());
+    Assert.assertEquals("text/html", doc.getFieldValue("type"));
+  }
+
+  @Test
+  public void testContentDispositionTitle() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+
+    Metadata metadata = new Metadata();
+    metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
+    MoreIndexingFilter filter = new MoreIndexingFilter();
+    filter.setConf(conf);
+
+    Text url = new Text("http://www.example.com/");
+    ParseImpl parseImpl = new ParseImpl("text", new ParseData(
+        new ParseStatus(), "title", new Outlink[0], metadata));
+
+    NutchDocument doc = new NutchDocument();
+    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
+
+    Assert.assertEquals("content-disposition not detected", "filename.ext",
+        doc.getFieldValue("title"));
+
+    /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
+    doc = new NutchDocument();
+    doc.add("title", "title");
+    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
+    Assert.assertEquals("do not add second title by content-disposition",
+        "title", doc.getFieldValue("title"));
+  }
+
+  private void assertParts(String[] parts, int count, String... expected) {
+    Assert.assertEquals(count, parts.length);
+    for (int i = 0; i < expected.length; i++) {
+      Assert.assertEquals(expected[i], parts[i]);
+    }
+  }
+
+  private void assertContentType(Configuration conf, String source,
+      String expected) throws IndexingException {
+    Metadata metadata = new Metadata();
+    metadata.add(Response.CONTENT_TYPE, source);
+    MoreIndexingFilter filter = new MoreIndexingFilter();
+    filter.setConf(conf);
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl(
+        "text", new ParseData(new ParseStatus(), "title", new Outlink[0],
+            metadata)), new Text("http://www.example.com/"), new CrawlDatum(),
+        new Inlinks());
+    Assert.assertEquals("mime type not detected", expected,
+        doc.getFieldValue("type"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
deleted file mode 100644
index f918dde..0000000
--- a/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.more;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestMoreIndexingFilter {
-
-  @Test
-  public void testContentType() throws IndexingException {
-    Configuration conf = NutchConfiguration.create();
-    assertContentType(conf, "text/html", "text/html");
-    assertContentType(conf, "text/html; charset=UTF-8", "text/html");
-  }
-
-  @Test
-  public void testGetParts() {
-    String[] parts = MoreIndexingFilter.getParts("text/html");
-    assertParts(parts, 2, "text", "html");
-  }
-
-  /**
-   * @since NUTCH-901
-   */
-  @Test
-  public void testNoParts() {
-    Configuration conf = NutchConfiguration.create();
-    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
-    MoreIndexingFilter filter = new MoreIndexingFilter();
-    filter.setConf(conf);
-    Assert.assertNotNull(filter);
-    NutchDocument doc = new NutchDocument();
-    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
-
-    try {
-      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
-          new CrawlDatum(), new Inlinks());
-    } catch (Exception e) {
-      e.printStackTrace();
-      Assert.fail(e.getMessage());
-    }
-    Assert.assertNotNull(doc);
-    Assert.assertTrue(doc.getFieldNames().contains("type"));
-    Assert.assertEquals(1, doc.getField("type").getValues().size());
-    Assert.assertEquals("text/html", doc.getFieldValue("type"));
-  }
-
-  @Test
-  public void testContentDispositionTitle() throws IndexingException {
-    Configuration conf = NutchConfiguration.create();
-
-    Metadata metadata = new Metadata();
-    metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
-    MoreIndexingFilter filter = new MoreIndexingFilter();
-    filter.setConf(conf);
-
-    Text url = new Text("http://www.example.com/");
-    ParseImpl parseImpl = new ParseImpl("text", new ParseData(
-        new ParseStatus(), "title", new Outlink[0], metadata));
-
-    NutchDocument doc = new NutchDocument();
-    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
-
-    Assert.assertEquals("content-disposition not detected", "filename.ext",
-        doc.getFieldValue("title"));
-
-    /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
-    doc = new NutchDocument();
-    doc.add("title", "title");
-    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
-    Assert.assertEquals("do not add second title by content-disposition",
-        "title", doc.getFieldValue("title"));
-  }
-
-  private void assertParts(String[] parts, int count, String... expected) {
-    Assert.assertEquals(count, parts.length);
-    for (int i = 0; i < expected.length; i++) {
-      Assert.assertEquals(expected[i], parts[i]);
-    }
-  }
-
-  private void assertContentType(Configuration conf, String source,
-      String expected) throws IndexingException {
-    Metadata metadata = new Metadata();
-    metadata.add(Response.CONTENT_TYPE, source);
-    MoreIndexingFilter filter = new MoreIndexingFilter();
-    filter.setConf(conf);
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl(
-        "text", new ParseData(new ParseStatus(), "title", new Outlink[0],
-            metadata)), new Text("http://www.example.com/"), new CrawlDatum(),
-        new Inlinks());
-    Assert.assertEquals("mime type not detected", expected,
-        doc.getFieldValue("type"));
-  }
-}

[66/69] [abbrv] nutch git commit: Moved test sources to maven standard directory

Reply via email to