/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.metadata;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Properties;

import org.junit.Assert;
import org.junit.Test;

/**
 * JUnit based tests of class
 * {@link org.apache.nutch.metadata.SpellCheckedMetadata}.
 *
 * @author Chris Mattmann
 * @author Jérôme Charron
 */
public class TestSpellCheckedMetadata {

  /** Iteration count for the (informal) I/O timing test. */
  private static final int NUM_ITERATIONS = 10000;

  /** Misspelled header names must all normalize to the canonical form. */
  @Test
  public void testGetNormalizedName() {
    Assert.assertEquals("Content-Type",
        SpellCheckedMetadata.getNormalizedName("Content-Type"));
    Assert.assertEquals("Content-Type",
        SpellCheckedMetadata.getNormalizedName("ContentType"));
    Assert.assertEquals("Content-Type",
        SpellCheckedMetadata.getNormalizedName("Content-type"));
    Assert.assertEquals("Content-Type",
        SpellCheckedMetadata.getNormalizedName("contenttype"));
    Assert.assertEquals("Content-Type",
        SpellCheckedMetadata.getNormalizedName("contentype"));
    Assert.assertEquals("Content-Type",
        SpellCheckedMetadata.getNormalizedName("contntype"));
  }

  /** Test for the <code>add(String, String)</code> method. */
  @Test
  public void testAdd() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();

    String[] values = metadata.getValues("contentype");
    Assert.assertEquals(0, values.length);

    metadata.add("contentype", "value1");
    values = metadata.getValues("contentype");
    Assert.assertEquals(1, values.length);
    Assert.assertEquals("value1", values[0]);

    // Adding under a different spelling appends to the same canonical key.
    metadata.add("Content-Type", "value2");
    values = metadata.getValues("contentype");
    Assert.assertEquals(2, values.length);
    Assert.assertEquals("value1", values[0]);
    Assert.assertEquals("value2", values[1]);

    // NOTE : For now, the same value can be added many times.
    // Should it be changed?
    metadata.add("ContentType", "value1");
    values = metadata.getValues("Content-Type");
    Assert.assertEquals(3, values.length);
    Assert.assertEquals("value1", values[0]);
    Assert.assertEquals("value2", values[1]);
    Assert.assertEquals("value1", values[2]);
  }

  /** Test for the <code>set(String, String)</code> method. */
  @Test
  public void testSet() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();

    String[] values = metadata.getValues("contentype");
    Assert.assertEquals(0, values.length);

    metadata.set("contentype", "value1");
    values = metadata.getValues("contentype");
    Assert.assertEquals(1, values.length);
    Assert.assertEquals("value1", values[0]);

    // set() replaces all previous values of the (spell-checked) key.
    metadata.set("Content-Type", "value2");
    values = metadata.getValues("contentype");
    Assert.assertEquals(1, values.length);
    Assert.assertEquals("value2", values[0]);

    metadata.set("contenttype", "new value 1");
    metadata.add("contenttype", "new value 2");
    values = metadata.getValues("contentype");
    Assert.assertEquals(2, values.length);
    Assert.assertEquals("new value 1", values[0]);
    Assert.assertEquals("new value 2", values[1]);
  }

  /** Test for <code>setAll(Properties)</code> method. */
  @Test
  public void testSetProperties() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();
    Properties props = new Properties();

    metadata.setAll(props);
    Assert.assertEquals(0, metadata.size());

    props.setProperty("name-one", "value1.1");
    metadata.setAll(props);
    Assert.assertEquals(1, metadata.size());
    String[] values = metadata.getValues("name-one");
    Assert.assertEquals(1, values.length);
    Assert.assertEquals("value1.1", values[0]);

    props.setProperty("name-two", "value2.1");
    metadata.setAll(props);
    Assert.assertEquals(2, metadata.size());
    values = metadata.getValues("name-one");
    Assert.assertEquals(1, values.length);
    Assert.assertEquals("value1.1", values[0]);
    values = metadata.getValues("name-two");
    Assert.assertEquals(1, values.length);
    Assert.assertEquals("value2.1", values[0]);
  }

  /** Test for <code>get(String)</code> method: returns the first value only. */
  @Test
  public void testGet() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();
    Assert.assertNull(metadata.get("a-name"));

    metadata.add("a-name", "value-1");
    Assert.assertEquals("value-1", metadata.get("a-name"));
    metadata.add("a-name", "value-2");
    Assert.assertEquals("value-1", metadata.get("a-name"));
  }

  /** Test for <code>isMultiValued()</code> method. */
  @Test
  public void testIsMultiValued() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();
    Assert.assertFalse(metadata.isMultiValued("key"));
    metadata.add("key", "value1");
    Assert.assertFalse(metadata.isMultiValued("key"));
    metadata.add("key", "value2");
    Assert.assertTrue(metadata.isMultiValued("key"));
  }

  /** Test for <code>names</code> method. */
  @Test
  public void testNames() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();
    String[] names = metadata.names();
    Assert.assertEquals(0, names.length);

    metadata.add("name-one", "value");
    names = metadata.names();
    Assert.assertEquals(1, names.length);
    Assert.assertEquals("name-one", names[0]);
    metadata.add("name-two", "value");
    names = metadata.names();
    Assert.assertEquals(2, names.length);
  }

  /** Test for <code>remove(String)</code> method. */
  @Test
  public void testRemove() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();
    // Removing a missing key is a no-op.
    metadata.remove("name-one");
    Assert.assertEquals(0, metadata.size());
    metadata.add("name-one", "value-1.1");
    metadata.add("name-one", "value-1.2");
    metadata.add("name-two", "value-2.2");
    Assert.assertEquals(2, metadata.size());
    Assert.assertNotNull(metadata.get("name-one"));
    Assert.assertNotNull(metadata.get("name-two"));
    metadata.remove("name-one");
    Assert.assertEquals(1, metadata.size());
    Assert.assertNull(metadata.get("name-one"));
    Assert.assertNotNull(metadata.get("name-two"));
    metadata.remove("name-two");
    Assert.assertEquals(0, metadata.size());
    Assert.assertNull(metadata.get("name-one"));
    Assert.assertNull(metadata.get("name-two"));
  }

  /** Test for <code>equals(Object)</code> method. */
  @Test
  public void testObject() {
    SpellCheckedMetadata meta1 = new SpellCheckedMetadata();
    SpellCheckedMetadata meta2 = new SpellCheckedMetadata();
    Assert.assertFalse(meta1.equals(null));
    Assert.assertFalse(meta1.equals("String"));
    Assert.assertTrue(meta1.equals(meta2));
    meta1.add("name-one", "value-1.1");
    Assert.assertFalse(meta1.equals(meta2));
    meta2.add("name-one", "value-1.1");
    Assert.assertTrue(meta1.equals(meta2));
    meta1.add("name-one", "value-1.2");
    Assert.assertFalse(meta1.equals(meta2));
    meta2.add("name-one", "value-1.2");
    Assert.assertTrue(meta1.equals(meta2));
    meta1.add("name-two", "value-2.1");
    Assert.assertFalse(meta1.equals(meta2));
    meta2.add("name-two", "value-2.1");
    Assert.assertTrue(meta1.equals(meta2));
    meta1.add("name-two", "value-2.2");
    Assert.assertFalse(meta1.equals(meta2));
    meta2.add("name-two", "value-2.x");
    Assert.assertFalse(meta1.equals(meta2));
  }

  /** Test for <code>Writable</code> implementation. */
  @Test
  public void testWritable() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();
    SpellCheckedMetadata result = writeRead(metadata);
    Assert.assertEquals(0, result.size());
    metadata.add("name-one", "value-1.1");
    result = writeRead(metadata);
    // Added after the round-trip, so it must not appear in 'result' yet.
    metadata.add("Contenttype", "text/html");
    Assert.assertEquals(1, result.size());
    Assert.assertEquals(1, result.getValues("name-one").length);
    Assert.assertEquals("value-1.1", result.get("name-one"));
    metadata.add("name-two", "value-2.1");
    metadata.add("name-two", "value-2.2");
    result = writeRead(metadata);
    Assert.assertEquals(3, result.size());
    Assert.assertEquals(1, result.getValues("name-one").length);
    Assert.assertEquals("value-1.1", result.getValues("name-one")[0]);
    Assert.assertEquals(2, result.getValues("name-two").length);
    Assert.assertEquals("value-2.1", result.getValues("name-two")[0]);
    Assert.assertEquals("value-2.2", result.getValues("name-two")[1]);
    // "Contenttype" was spell-checked to the canonical Content-Type key.
    Assert.assertEquals("text/html", result.get(Metadata.CONTENT_TYPE));
  }

  /**
   * IO Test method, usable only when you plan to do changes in metadata to
   * measure relative performance impact.
   */
  @Test
  public final void testHandlingSpeed() {
    @SuppressWarnings("unused")
    SpellCheckedMetadata result;
    long start = System.currentTimeMillis();
    for (int i = 0; i < NUM_ITERATIONS; i++) {
      result = writeRead(constructSpellCheckedMetadata());
    }
    System.out.println(NUM_ITERATIONS + " spellchecked metadata I/O time:"
        + (System.currentTimeMillis() - start) + "ms.");
  }

  /** Serializes {@code metadata} and deserializes it into a fresh instance. */
  private SpellCheckedMetadata writeRead(SpellCheckedMetadata metadata) {
    SpellCheckedMetadata deserialized = new SpellCheckedMetadata();
    try {
      ByteArrayOutputStream out = new ByteArrayOutputStream();
      metadata.write(new DataOutputStream(out));
      deserialized.readFields(new DataInputStream(
          new ByteArrayInputStream(out.toByteArray())));
    } catch (IOException ioe) {
      Assert.fail(ioe.toString());
    }
    return deserialized;
  }

  /**
   * Assembles a Spellchecked metadata Object.
   */
  public static final SpellCheckedMetadata constructSpellCheckedMetadata() {
    SpellCheckedMetadata meta = new SpellCheckedMetadata();
    meta.add("Content-type", "foo/bar");
    meta.add("Connection", "close");
    meta.add("Last-Modified", "Sat, 09 Dec 2006 15:09:57 GMT");
    meta.add("Server", "Foobar");
    meta.add("Date", "Sat, 09 Dec 2006 18:07:20 GMT");
    meta.add("Accept-Ranges", "bytes");
    meta.add("ETag", "\"1234567-89-01234567\"");
    meta.add("Content-Length", "123");
    meta.add(Nutch.SEGMENT_NAME_KEY, "segmentzzz");
    meta.add(Nutch.SIGNATURE_KEY, "123");
    return meta;
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java b/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java new file mode 100644 index 0000000..ef07907 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.test.IntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class TestURLFilters { + + /** + * Testcase for NUTCH-325. 
+ * + * @throws URLFilterException + */ + @Test + public void testNonExistingUrlFilter() throws URLFilterException { + Configuration conf = NutchConfiguration.create(); + String class1 = "NonExistingFilter"; + String class2 = "org.apache.nutch.urlfilter.prefix.PrefixURLFilter"; + conf.set(URLFilters.URLFILTER_ORDER, class1 + " " + class2); + + URLFilters normalizers = new URLFilters(conf); + normalizers.filter("http://someurl/"); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java b/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java new file mode 100644 index 0000000..d29e9d3 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.net;

import java.net.MalformedURLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Test;
import org.junit.experimental.categories.Category;

@Category(IntegrationTest.class)
public class TestURLNormalizers {

  /**
   * Checks that a configured normalizer chain runs, removes superfluous
   * slashes (NUTCH-1011, NUTCH-1319) and respects the configured order.
   */
  @Test
  public void testURLNormalizers() {
    Configuration conf = NutchConfiguration.create();
    String clazz1 = "org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer";
    String clazz2 = "org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer";
    conf.set("urlnormalizer.order", clazz1 + " " + clazz2);

    URLNormalizers normalizers = new URLNormalizers(conf,
        URLNormalizers.SCOPE_DEFAULT);

    Assert.assertNotNull(normalizers);
    try {
      normalizers.normalize("http://www.example.com/",
          URLNormalizers.SCOPE_DEFAULT);
    } catch (MalformedURLException mue) {
      Assert.fail(mue.toString());
    }

    // NUTCH-1011 - Get rid of superfluous slashes
    // Fix: assertEquals takes (expected, actual) — the original had the
    // arguments reversed, which produces misleading failure messages.
    try {
      String normalizedSlashes = normalizers.normalize(
          "http://www.example.com//path/to//somewhere.html",
          URLNormalizers.SCOPE_DEFAULT);
      Assert.assertEquals("http://www.example.com/path/to/somewhere.html",
          normalizedSlashes);
    } catch (MalformedURLException mue) {
      Assert.fail(mue.toString());
    }

    // HostNormalizer NUTCH-1319
    try {
      String normalizedHost = normalizers.normalize(
          "http://www.example.org//path/to//somewhere.html",
          URLNormalizers.SCOPE_DEFAULT);
      Assert.assertEquals("http://www.example.org/path/to/somewhere.html",
          normalizedHost);
    } catch (MalformedURLException mue) {
      Assert.fail(mue.toString());
    }

    // check the order
    int pos1 = -1, pos2 = -1;
    URLNormalizer[] impls = normalizers
        .getURLNormalizers(URLNormalizers.SCOPE_DEFAULT);
    for (int i = 0; i < impls.length; i++) {
      if (impls[i].getClass().getName().equals(clazz1))
        pos1 = i;
      if (impls[i].getClass().getName().equals(clazz2))
        pos2 = i;
    }
    if (pos1 != -1 && pos2 != -1) {
      Assert.assertTrue("RegexURLNormalizer before BasicURLNormalizer",
          pos1 < pos2);
    }
  }
}
+ * + * @author Stephan Strittmatter - http://www.sybit.de + * + * @version 1.0 + */ +public class TestOutlinkExtractor { + + private static Configuration conf = NutchConfiguration.create(); + + @Test + public void testGetNoOutlinks() { + Outlink[] outlinks = null; + + outlinks = OutlinkExtractor.getOutlinks(null, conf); + Assert.assertNotNull(outlinks); + Assert.assertEquals(0, outlinks.length); + + outlinks = OutlinkExtractor.getOutlinks("", conf); + Assert.assertNotNull(outlinks); + Assert.assertEquals(0, outlinks.length); + } + + @Test + public void testGetOutlinksHttp() { + Outlink[] outlinks = OutlinkExtractor + .getOutlinks( + "Test with http://www.nutch.org/index.html is it found? " + + "What about www.google.com at http://www.google.de " + + "A longer URL could be http://www.sybit.com/solutions/portals.html", + conf); + + Assert.assertTrue("Url not found!", outlinks.length == 3); + Assert.assertEquals("Wrong URL", "http://www.nutch.org/index.html", + outlinks[0].getToUrl()); + Assert.assertEquals("Wrong URL", "http://www.google.de", + outlinks[1].getToUrl()); + Assert.assertEquals("Wrong URL", + "http://www.sybit.com/solutions/portals.html", outlinks[2].getToUrl()); + } + + @Test + public void testGetOutlinksHttp2() { + Outlink[] outlinks = OutlinkExtractor + .getOutlinks( + "Test with http://www.nutch.org/index.html is it found? 
" + + "What about www.google.com at http://www.google.de " + + "A longer URL could be http://www.sybit.com/solutions/portals.html", + "http://www.sybit.de", conf); + + Assert.assertTrue("Url not found!", outlinks.length == 3); + Assert.assertEquals("Wrong URL", "http://www.nutch.org/index.html", + outlinks[0].getToUrl()); + Assert.assertEquals("Wrong URL", "http://www.google.de", + outlinks[1].getToUrl()); + Assert.assertEquals("Wrong URL", + "http://www.sybit.com/solutions/portals.html", outlinks[2].getToUrl()); + } + + @Test + public void testGetOutlinksFtp() { + Outlink[] outlinks = OutlinkExtractor.getOutlinks( + "Test with ftp://www.nutch.org is it found? " + + "What about www.google.com at ftp://www.google.de", conf); + + Assert.assertTrue("Url not found!", outlinks.length > 1); + Assert.assertEquals("Wrong URL", "ftp://www.nutch.org", + outlinks[0].getToUrl()); + Assert.assertEquals("Wrong URL", "ftp://www.google.de", + outlinks[1].getToUrl()); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/parse/TestParseData.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/TestParseData.java b/nutch-core/src/test/java/org/apache/nutch/parse/TestParseData.java new file mode 100644 index 0000000..550a260 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/parse/TestParseData.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +import org.apache.nutch.util.WritableTestUtils; +import org.apache.nutch.metadata.Metadata; +import org.junit.Assert; +import org.junit.Test; + +/** Unit tests for ParseData. */ + +public class TestParseData { + + @Test + public void testParseData() throws Exception { + + String title = "The Foo Page"; + + Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo"), + new Outlink("http://bar.com/", "Bar") }; + + Metadata metaData = new Metadata(); + metaData.add("Language", "en/us"); + metaData.add("Charset", "UTF-8"); + + ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, + metaData); + + WritableTestUtils.testWritable(r, null); + } + + @Test + public void testMaxOutlinks() throws Exception { + Outlink[] outlinks = new Outlink[128]; + for (int i = 0; i < outlinks.length; i++) { + outlinks[i] = new Outlink("http://outlink.com/" + i, "Outlink" + i); + } + ParseData original = new ParseData(ParseStatus.STATUS_SUCCESS, + "Max Outlinks Title", outlinks, new Metadata()); + ParseData data = (ParseData) WritableTestUtils.writeRead(original, null); + Assert.assertEquals(outlinks.length, data.getOutlinks().length); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/parse/TestParseText.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/TestParseText.java b/nutch-core/src/test/java/org/apache/nutch/parse/TestParseText.java new file mode 100644 
index 0000000..241b293 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/parse/TestParseText.java @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +import org.apache.nutch.util.WritableTestUtils; +import org.junit.Test; + +/** Unit tests for ParseText. */ + +public class TestParseText { + + @Test + public void testParseText() throws Exception { + + String page = "Hello World The Quick Brown Fox Jumped Over the Lazy Fox"; + ParseText s = new ParseText(page); + WritableTestUtils.testWritable(s); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java b/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java new file mode 100644 index 0000000..198e284 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java @@ -0,0 +1,108 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +// Nutch imports +import org.apache.nutch.plugin.Extension; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.test.IntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +/** + * Unit test for new parse plugin selection. + * + * @author Sebastien Le Callonnec + * @version 1.0 + */ +@Category(IntegrationTest.class) +public class TestParserFactory { + + private Configuration conf; + private ParserFactory parserFactory; + + /** Inits the Test Case with the test parse-plugin file */ + @Before + public void setUp() throws Exception { + conf = NutchConfiguration.create(); + conf.set("plugin.includes", ".*"); + conf.set("parse.plugin.file", + "org/apache/nutch/parse/parse-plugin-test.xml"); + parserFactory = new ParserFactory(conf); + } + + /** Unit test for <code>getExtensions(String)</code> method. 
*/ + @Test + public void testGetExtensions() throws Exception { + Extension ext = parserFactory.getExtensions("text/html").get(0); + Assert.assertEquals("parse-tika", ext.getDescriptor().getPluginId()); + ext = parserFactory.getExtensions("text/html; charset=ISO-8859-1").get(0); + Assert.assertEquals("parse-tika", ext.getDescriptor().getPluginId()); + ext = parserFactory.getExtensions("foo/bar").get(0); + Assert.assertEquals("parse-tika", ext.getDescriptor().getPluginId()); + } + + /** Unit test to check <code>getParsers</code> method */ + @Test + public void testGetParsers() throws Exception { + Parser[] parsers = parserFactory.getParsers("text/html", "http://foo.com"); + Assert.assertNotNull(parsers); + Assert.assertEquals(1, parsers.length); + Assert.assertEquals("org.apache.nutch.parse.tika.TikaParser", parsers[0] + .getClass().getName()); + + parsers = parserFactory.getParsers("text/html; charset=ISO-8859-1", + "http://foo.com"); + Assert.assertNotNull(parsers); + Assert.assertEquals(1, parsers.length); + Assert.assertEquals("org.apache.nutch.parse.tika.TikaParser", parsers[0] + .getClass().getName()); + + parsers = parserFactory.getParsers("application/x-javascript", + "http://foo.com"); + Assert.assertNotNull(parsers); + Assert.assertEquals(1, parsers.length); + Assert.assertEquals("org.apache.nutch.parse.js.JSParseFilter", parsers[0] + .getClass().getName()); + + parsers = parserFactory.getParsers("text/plain", "http://foo.com"); + Assert.assertNotNull(parsers); + Assert.assertEquals(1, parsers.length); + Assert.assertEquals("org.apache.nutch.parse.tika.TikaParser", parsers[0] + .getClass().getName()); + + Parser parser1 = parserFactory.getParsers("text/plain", "http://foo.com")[0]; + Parser parser2 = parserFactory.getParsers("*", "http://foo.com")[0]; + + Assert.assertEquals("Different instances!", parser1.hashCode(), + parser2.hashCode()); + + // test and make sure that the rss parser is loaded even though its + // plugin.xml + // doesn't claim to support 
text/rss, only application/rss+xml + parsers = parserFactory.getParsers("text/rss", "http://foo.com"); + Assert.assertNotNull(parsers); + Assert.assertEquals(1, parsers.length); + Assert.assertEquals("org.apache.nutch.parse.tika.TikaParser", parsers[0] + .getClass().getName()); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/parse/parse-plugin-test.xml ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/parse-plugin-test.xml b/nutch-core/src/test/java/org/apache/nutch/parse/parse-plugin-test.xml new file mode 100644 index 0000000..b748905 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/parse/parse-plugin-test.xml @@ -0,0 +1,58 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Author : mattmann + Description: Test parse-plugins.xml file. +--> + +<parse-plugins> + + <!-- by default if the mimeType is set to *, or + if it can't be determined, use parse-tika --> + <mimeType name="*"> + <plugin id="parse-tika" /> + </mimeType> + + <!-- test these 4 plugins --> + <mimeType name="text/html"> + <!-- + ! Test that if a parser cannot be instanciated, + ! 
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements.  See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.

  Author : mattmann
  Description: Test parse-plugins.xml file.
-->

<parse-plugins>

  <!-- By default, if the mimeType is set to *, or if it can't be
       determined, use parse-tika -->
  <mimeType name="*">
    <plugin id="parse-tika" />
  </mimeType>

  <!-- test these 4 plugins -->
  <mimeType name="text/html">
    <!--
     ! Test that if a parser cannot be instantiated,
     ! it should not block the process and then the next one is used
     !-->
    <plugin id="parse-plugin-that-not-exist"/>
  </mimeType>

  <mimeType name="application/x-javascript">
    <plugin id="parse-js"/>
  </mimeType>

  <mimeType name="text/rss">
    <!-- Test that an extension-id can be directly used here -->
    <plugin id="org.apache.nutch.parse.rss.RSSParser"/>
  </mimeType>

  <!-- Alias mappings for parse-xxx names to the actual extension
       implementation ids described in each plugin's plugin.xml file -->
  <aliases>
    <alias name="parse-js"
           extension-id="JSParser" />
    <alias name="parse-rss"
           extension-id="org.apache.nutch.parse.rss.RSSParser" />
    <alias name="parse-tika"
           extension-id="org.apache.nutch.parse.tika.TikaParser" />
  </aliases>
</parse-plugins>
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.plugin; + +/** + * Simple Test-extensions + * + * @author joa23 + */ +public class HelloWorldExtension implements ITestExtension { + + /* + * (non-Javadoc) + * + * @see + * org.apache.nutch.plugin.ITestExtension#testGetExtension(java.lang.String) + */ + public String testGetExtension(String hello) { + return hello + " World"; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/plugin/ITestExtension.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/plugin/ITestExtension.java b/nutch-core/src/test/java/org/apache/nutch/plugin/ITestExtension.java new file mode 100644 index 0000000..b6aa81d --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/plugin/ITestExtension.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + +/** + * A Simple Test Extension Interface. + * + * @author joa23 + * + */ +public interface ITestExtension { + public String testGetExtension(String hello); +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/plugin/SimpleTestPlugin.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/plugin/SimpleTestPlugin.java b/nutch-core/src/test/java/org/apache/nutch/plugin/SimpleTestPlugin.java new file mode 100644 index 0000000..080142d --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/plugin/SimpleTestPlugin.java @@ -0,0 +1,57 @@ +/* +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.plugin; + +import org.apache.hadoop.conf.Configuration; + +/** + * Simple Test plugin + * + * @author joa23 + */ +public class SimpleTestPlugin extends Plugin { + + /** + * @param pDescriptor + * @param conf + */ + public SimpleTestPlugin(PluginDescriptor pDescriptor, Configuration conf) { + + super(pDescriptor, conf); + } + + /* + * @see org.apache.nutch.plugin.Plugin#startUp() + */ + public void startUp() throws PluginRuntimeException { + System.err.println("start up Plugin: " + getDescriptor().getPluginId()); + + } + + /* + * (non-Javadoc) + * + * @see org.apache.nutch.plugin.Plugin#shutDown() + */ + public void shutDown() throws PluginRuntimeException { + System.err.println("shutdown Plugin: " + getDescriptor().getPluginId()); + + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java b/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java new file mode 100644 index 0000000..7bcc9ab --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java @@ -0,0 +1,305 @@ +/* + /** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.util.LinkedList; +import java.util.Locale; +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapred.JobConf; +import org.apache.nutch.test.IntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +/** + * Unit tests for the plugin system + */ +@Category(IntegrationTest.class) +public class TestPluginSystem { + private int fPluginCount; + + private LinkedList<File> fFolders = new LinkedList<File>(); + private Configuration conf; + private PluginRepository repository; + + @Before + public void setUp() throws Exception { + this.conf = NutchConfiguration.create(); + conf.set("plugin.includes", ".*"); + // String string = this.conf.get("plugin.includes", ""); + // conf.set("plugin.includes", string + "|Dummy*"); + fPluginCount = 5; + createDummyPlugins(fPluginCount); + this.repository = PluginRepository.get(conf); + } + + /* + * (non-Javadoc) + * + * @see junit.framework.TestCase#tearDown() + */ + @After + public void tearDown() throws Exception { + for (int i = 0; i < fFolders.size(); i++) { + File folder = fFolders.get(i); + delete(folder); + folder.delete(); + } + } + + /** + */ + @Test + public 
void testPluginConfiguration() { + String string = getPluginFolder(); + File file = new File(string); + if (!file.exists()) { + file.mkdir(); + } + Assert.assertTrue(file.exists()); + } + + /** + */ + @Test + public void testLoadPlugins() { + PluginDescriptor[] descriptors = repository.getPluginDescriptors(); + int k = descriptors.length; + Assert.assertTrue(fPluginCount <= k); + for (int i = 0; i < descriptors.length; i++) { + PluginDescriptor descriptor = descriptors[i]; + if (!descriptor.getPluginId().startsWith("getPluginFolder()")) { + continue; + } + Assert.assertEquals(1, descriptor.getExportedLibUrls().length); + Assert.assertEquals(1, descriptor.getNotExportedLibUrls().length); + } + } + + @Test + public void testRepositoryCache() { + Configuration config = NutchConfiguration.create(); + PluginRepository repo = PluginRepository.get(config); + JobConf job = new NutchJob(config); + PluginRepository repo1 = PluginRepository.get(job); + Assert.assertTrue(repo == repo1); + // now construct a config without UUID + config = new Configuration(); + config.addResource("nutch-default.xml"); + config.addResource("nutch-site.xml"); + repo = PluginRepository.get(config); + job = new NutchJob(config); + repo1 = PluginRepository.get(job); + Assert.assertTrue(repo1 != repo); + } + + /** + * + */ + @Test + public void testGetExtensionAndAttributes() { + String xpId = " sdsdsd"; + ExtensionPoint extensionPoint = repository.getExtensionPoint(xpId); + Assert.assertEquals(extensionPoint, null); + Extension[] extension1 = repository.getExtensionPoint(getGetExtensionId()) + .getExtensions(); + Assert.assertEquals(extension1.length, fPluginCount); + for (int i = 0; i < extension1.length; i++) { + Extension extension2 = extension1[i]; + String string = extension2.getAttribute(getGetConfigElementName()); + Assert.assertEquals(string, getParameterValue()); + } + } + + /** + * @throws PluginRuntimeException + */ + @Test + public void testGetExtensionInstances() throws 
PluginRuntimeException { + Extension[] extensions = repository.getExtensionPoint(getGetExtensionId()) + .getExtensions(); + Assert.assertEquals(extensions.length, fPluginCount); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + Object object = extension.getExtensionInstance(); + if (!(object instanceof HelloWorldExtension)) + Assert.fail(" object is not a instance of HelloWorldExtension"); + ((ITestExtension) object).testGetExtension("Bla "); + String string = ((ITestExtension) object).testGetExtension("Hello"); + Assert.assertEquals("Hello World", string); + } + } + + /** + * + * + */ + @Test + public void testGetClassLoader() { + PluginDescriptor[] descriptors = repository.getPluginDescriptors(); + for (int i = 0; i < descriptors.length; i++) { + PluginDescriptor descriptor = descriptors[i]; + Assert.assertNotNull(descriptor.getClassLoader()); + } + } + + /** + * @throws IOException + */ + @Test + public void testGetResources() throws IOException { + PluginDescriptor[] descriptors = repository.getPluginDescriptors(); + for (int i = 0; i < descriptors.length; i++) { + PluginDescriptor descriptor = descriptors[i]; + if (!descriptor.getPluginId().startsWith("getPluginFolder()")) { + continue; + } + String value = descriptor.getResourceString("key", Locale.UK); + Assert.assertEquals("value", value); + value = descriptor.getResourceString("key", Locale.TRADITIONAL_CHINESE); + Assert.assertEquals("value", value); + + } + } + + /** + * @return a PluginFolderPath + */ + private String getPluginFolder() { + String[] strings = conf.getStrings("plugin.folders"); + if (strings == null || strings.length == 0) + Assert.fail("no plugin directory setuped.."); + + String name = strings[0]; + return new PluginManifestParser(conf, this.repository) + .getPluginFolder(name).toString(); + } + + /** + * Creates some Dummy Plugins + * + * @param pCount + */ + private void createDummyPlugins(int pCount) { + String string = getPluginFolder(); + try { 
+ File folder = new File(string); + folder.mkdir(); + for (int i = 0; i < pCount; i++) { + String pluginFolder = string + File.separator + "DummyPlugin" + i; + File file = new File(pluginFolder); + file.mkdir(); + fFolders.add(file); + createPluginManifest(i, file.getAbsolutePath()); + createResourceFile(file.getAbsolutePath()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + + /** + * Creates an ResourceFile + * + * @param pFolderPath + * @throws FileNotFoundException + * @throws IOException + */ + private void createResourceFile(String pFolderPath) + throws FileNotFoundException, IOException { + Properties properties = new Properties(); + properties.setProperty("key", "value"); + properties.store(new FileOutputStream(pFolderPath + File.separator + + "messages" + ".properties"), ""); + } + + /** + * Deletes files in path + * + * @param path + * @throws IOException + */ + private void delete(File path) throws IOException { + File[] files = path.listFiles(); + for (int i = 0; i < files.length; ++i) { + if (files[i].isDirectory()) + delete(files[i]); + files[i].delete(); + } + } + + /** + * Creates an Plugin Manifest File + * + * @param i + * @param pFolderPath + * @throws IOException + */ + private void createPluginManifest(int i, String pFolderPath) + throws IOException { + FileWriter out = new FileWriter(pFolderPath + File.separator + "plugin.xml"); + String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + + "<!--this is just a simple plugin for testing issues.-->" + + "<plugin id=\"org.apache.nutch.plugin." 
+ + i + + "\" name=\"" + + i + + "\" version=\"1.0\" provider-name=\"joa23\" " + + "class=\"org.apache.nutch.plugin.SimpleTestPlugin\">" + + "<extension-point id=\"aExtensioID\" " + + "name=\"simple Parser Extension\" " + + "schema=\"schema/testExtensionPoint.exsd\"/>" + + "<runtime><library name=\"libs/exported.jar\"><extport/></library>" + + "<library name=\"libs/not_exported.jar\"/></runtime>" + + "<extension point=\"aExtensioID\">" + + "<implementation name=\"simple Parser Extension\" " + + "id=\"aExtensionId.\" class=\"org.apache.nutch.plugin.HelloWorldExtension\">" + + "<parameter name=\"dummy-name\" value=\"a simple param value\"/>" + + "</implementation></extension></plugin>"; + out.write(xml); + out.flush(); + out.close(); + } + + private String getParameterValue() { + return "a simple param value"; + } + + private static String getGetExtensionId() { + return "aExtensioID"; + } + + private static String getGetConfigElementName() { + return "dummy-name"; + } + + public static void main(String[] args) throws IOException { + new TestPluginSystem().createPluginManifest(1, "/"); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/protocol/TestContent.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/protocol/TestContent.java b/nutch-core/src/test/java/org/apache/nutch/protocol/TestContent.java new file mode 100644 index 0000000..1475cda --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/protocol/TestContent.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol; + +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.SpellCheckedMetadata; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.WritableTestUtils; +import org.apache.tika.mime.MimeTypes; +import org.junit.Assert; +import org.junit.Test; + +/** Unit tests for Content. */ + +public class TestContent { + + private static Configuration conf = NutchConfiguration.create(); + + @Test + public void testContent() throws Exception { + + String page = "<HTML><BODY><H1>Hello World</H1><P>The Quick Brown Fox Jumped Over the Lazy Fox.</BODY></HTML>"; + + String url = "http://www.foo.com/"; + + SpellCheckedMetadata metaData = new SpellCheckedMetadata(); + metaData.add("Host", "www.foo.com"); + metaData.add("Content-Type", "text/html"); + + Content r = new Content(url, url, page.getBytes("UTF8"), "text/html", + metaData, conf); + + WritableTestUtils.testWritable(r); + Assert.assertEquals("text/html", r.getMetadata().get("Content-Type")); + Assert.assertEquals("text/html", r.getMetadata().get("content-type")); + Assert.assertEquals("text/html", r.getMetadata().get("CONTENTYPE")); + } + + /** Unit tests for getContentType(String, String, byte[]) method. 
*/ + @Test + public void testGetContentType() throws Exception { + Content c = null; + Metadata p = new Metadata(); + + c = new Content("http://www.foo.com/", "http://www.foo.com/", + "".getBytes("UTF8"), "text/html; charset=UTF-8", p, conf); + Assert.assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/foo.html", "http://www.foo.com/", + "".getBytes("UTF8"), "", p, conf); + Assert.assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/foo.html", "http://www.foo.com/", + "".getBytes("UTF8"), null, p, conf); + Assert.assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/", "http://www.foo.com/", + "<html></html>".getBytes("UTF8"), "", p, conf); + Assert.assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/foo.html", "http://www.foo.com/", + "<html></html>".getBytes("UTF8"), "text/plain", p, conf); + Assert.assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/foo.png", "http://www.foo.com/", + "<html></html>".getBytes("UTF8"), "text/plain", p, conf); + Assert.assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/", "http://www.foo.com/", + "".getBytes("UTF8"), "", p, conf); + Assert.assertEquals(MimeTypes.OCTET_STREAM, c.getContentType()); + + c = new Content("http://www.foo.com/", "http://www.foo.com/", + "".getBytes("UTF8"), null, p, conf); + Assert.assertNotNull(c.getContentType()); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java b/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java new file mode 100644 index 0000000..6b4c8fd --- /dev/null +++ 
b/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.test.IntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.ObjectCache; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class TestProtocolFactory { + + Configuration conf; + ProtocolFactory factory; + + @Before + public void setUp() throws Exception { + conf = NutchConfiguration.create(); + conf.set("plugin.includes", ".*"); + conf.set("http.agent.name", "test-bot"); + factory = new ProtocolFactory(conf); + } + + @Test + public void testGetProtocol() { + + // non existing protocol + try { + factory.getProtocol("xyzxyz://somehost"); + Assert.fail("Must throw ProtocolNotFound"); + } catch (ProtocolNotFound e) { + // all is ok + } catch (Exception ex) { + Assert.fail("Must not throw any other exception"); + } + + Protocol httpProtocol = null; + + // existing protocol + try { + httpProtocol = 
factory.getProtocol("http://somehost"); + Assert.assertNotNull(httpProtocol); + } catch (Exception ex) { + Assert.fail("Must not throw any other exception"); + } + + // cache key + Object protocol = ObjectCache.get(conf).getObject( + Protocol.X_POINT_ID + "http"); + Assert.assertNotNull(protocol); + Assert.assertEquals(httpProtocol, protocol); + + // test same object instance + try { + Assert.assertTrue(httpProtocol == factory.getProtocol("http://somehost")); + } catch (ProtocolNotFound e) { + Assert.fail("Must not throw any exception"); + } + } + + @Test + public void testContains() { + Assert.assertTrue(factory.contains("http", "http")); + Assert.assertTrue(factory.contains("http", "http,ftp")); + Assert.assertTrue(factory.contains("http", " http , ftp")); + Assert.assertTrue(factory.contains("smb", "ftp,smb,http")); + Assert.assertFalse(factory.contains("smb", "smbb")); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMerger.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMerger.java b/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMerger.java new file mode 100644 index 0000000..6657c42 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMerger.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.segment; + +import java.text.DecimalFormat; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.MapFile.Writer.Option; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.MapFileOutputFormat; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestSegmentMerger { + Configuration conf; + FileSystem fs; + Path testDir; + Path seg1; + Path seg2; + Path out; + int countSeg1, countSeg2; + + @Before + public void setUp() throws Exception { + conf = NutchConfiguration.create(); + fs = FileSystem.get(conf); + testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + + System.currentTimeMillis()); + seg1 = new Path(testDir, "seg1"); + seg2 = new Path(testDir, "seg2"); + out = new Path(testDir, "out"); + + // create large parse-text segments + System.err.println("Creating large segment 1..."); + DecimalFormat df = new DecimalFormat("0000000"); + Text k = new Text(); + Path ptPath = new Path(new Path(seg1, ParseText.DIR_NAME), "part-00000"); + Option kOpt = MapFile.Writer.keyClass(Text.class); + org.apache.hadoop.io.SequenceFile.Writer.Option vOpt = SequenceFile.Writer.valueClass(ParseText.class); + MapFile.Writer w = new MapFile.Writer(conf, 
ptPath, kOpt, vOpt); + long curSize = 0; + countSeg1 = 0; + FileStatus fileStatus = fs.getFileStatus(ptPath); + long blkSize = fileStatus.getBlockSize(); + + while (curSize < blkSize * 2) { + k.set("seg1-" + df.format(countSeg1)); + w.append(k, new ParseText("seg1 text " + countSeg1)); + countSeg1++; + curSize += 40; // roughly ... + } + w.close(); + System.err.println(" - done: " + countSeg1 + " records."); + System.err.println("Creating large segment 2..."); + ptPath = new Path(new Path(seg2, ParseText.DIR_NAME), "part-00000"); + Option wKeyOpt = MapFile.Writer.keyClass(Text.class); + org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(ParseText.class); + w = new MapFile.Writer(conf, ptPath, wKeyOpt, wValueOpt); + curSize = 0; + countSeg2 = 0; + while (curSize < blkSize * 2) { + k.set("seg2-" + df.format(countSeg2)); + w.append(k, new ParseText("seg2 text " + countSeg2)); + countSeg2++; + curSize += 40; // roughly ... + } + w.close(); + System.err.println(" - done: " + countSeg2 + " records."); + } + + @After + public void tearDown() throws Exception { + fs.delete(testDir, true); + } + + @Test + public void testLargeMerge() throws Exception { + SegmentMerger merger = new SegmentMerger(conf); + merger.merge(out, new Path[] { seg1, seg2 }, false, false, -1); + // verify output + FileStatus[] stats = fs.listStatus(out); + // there should be just one path + Assert.assertEquals(1, stats.length); + Path outSeg = stats[0].getPath(); + Text k = new Text(); + ParseText v = new ParseText(); + MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path( + outSeg, ParseText.DIR_NAME), conf); + int cnt1 = 0, cnt2 = 0; + for (MapFile.Reader r : readers) { + while (r.next(k, v)) { + String ks = k.toString(); + String vs = v.getText(); + if (ks.startsWith("seg1-")) { + cnt1++; + Assert.assertTrue(vs.startsWith("seg1 ")); + } else if (ks.startsWith("seg2-")) { + cnt2++; + Assert.assertTrue(vs.startsWith("seg2 ")); + } + } + 
r.close(); + } + Assert.assertEquals(countSeg1, cnt1); + Assert.assertEquals(countSeg2, cnt2); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java b/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java new file mode 100644 index 0000000..aaed8bc --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java @@ -0,0 +1,427 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.segment; + +import java.text.DecimalFormat; +import java.util.Random; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.MapFile.Writer.Option; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.MapFileOutputFormat; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * New SegmentMerger unit test focusing on several crappy issues with the + * segment merger. The general problem is disappearing records and incorrect + * CrawlDatum status values. This unit test performs random sequences of segment + * merging where we're looking for an expected status. A second test is able to + * randomly inject redirects in segment, likely causing the segment merger to + * fail resulting in a bad merged segment. + * + * See also: + * + * https://issues.apache.org/jira/browse/NUTCH-1113 + * https://issues.apache.org/jira/browse/NUTCH-1616 + * https://issues.apache.org/jira/browse/NUTCH-1520 + * + * Cheers! 
+ */ +public class TestSegmentMergerCrawlDatums { + Configuration conf; + FileSystem fs; + Random rnd; + + private static final Logger LOG = LoggerFactory + .getLogger(TestSegmentMergerCrawlDatums.class); + + @Before + public void setUp() throws Exception { + conf = NutchConfiguration.create(); + fs = FileSystem.get(conf); + rnd = new Random(); + } + + /** + * + */ + @Test + public void testSingleRandomSequence() throws Exception { + Assert.assertEquals( + new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), + new Byte(executeSequence(CrawlDatum.STATUS_FETCH_GONE, + CrawlDatum.STATUS_FETCH_SUCCESS, 256, false))); + } + + /** + * + */ + @Test + public void testMostlyRedirects() throws Exception { + // Our test directory + Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + + System.currentTimeMillis()); + + Path segment1 = new Path(testDir, "20140110114943"); + Path segment2 = new Path(testDir, "20140110114832"); + Path segment3 = new Path(testDir, "20140110114558"); + Path segment4 = new Path(testDir, "20140110114930"); + Path segment5 = new Path(testDir, "20140110114545"); + Path segment6 = new Path(testDir, "20140110114507"); + Path segment7 = new Path(testDir, "20140110114903"); + Path segment8 = new Path(testDir, "20140110114724"); + + createSegment(segment1, CrawlDatum.STATUS_FETCH_SUCCESS, true); + createSegment(segment2, CrawlDatum.STATUS_FETCH_SUCCESS, true); + createSegment(segment3, CrawlDatum.STATUS_FETCH_SUCCESS, true); + createSegment(segment4, CrawlDatum.STATUS_FETCH_SUCCESS, true); + createSegment(segment5, CrawlDatum.STATUS_FETCH_SUCCESS, true); + createSegment(segment6, CrawlDatum.STATUS_FETCH_SUCCESS, false); + createSegment(segment7, CrawlDatum.STATUS_FETCH_SUCCESS, true); + createSegment(segment8, CrawlDatum.STATUS_FETCH_SUCCESS, true); + + // Merge the segments and get status + Path mergedSegment = merge(testDir, new Path[] { segment1, segment2, + segment3, segment4, segment5, segment6, segment7, segment8 }); + Byte status = new Byte(status = 
checkMergedSegment(testDir, mergedSegment)); + + Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status); + } + + /** + * + */ + @Test + public void testRandomizedSequences() throws Exception { + for (int i = 0; i < rnd.nextInt(16) + 16; i++) { + byte expectedStatus = (byte) (rnd.nextInt(6) + 0x21); + while (expectedStatus == CrawlDatum.STATUS_FETCH_RETRY + || expectedStatus == CrawlDatum.STATUS_FETCH_NOTMODIFIED) { + // fetch_retry and fetch_notmodified never remain in a merged segment + expectedStatus = (byte) (rnd.nextInt(6) + 0x21); + } + byte randomStatus = (byte) (rnd.nextInt(6) + 0x21); + int rounds = rnd.nextInt(16) + 32; + boolean withRedirects = rnd.nextBoolean(); + + byte resultStatus = executeSequence(randomStatus, expectedStatus, rounds, + withRedirects); + Assert.assertEquals( + "Expected status = " + CrawlDatum.getStatusName(expectedStatus) + + ", but got " + CrawlDatum.getStatusName(resultStatus) + + " when merging " + rounds + " segments" + + (withRedirects ? " with redirects" : ""), expectedStatus, + resultStatus); + } + } + + /** + * + */ + @Test + public void testRandomTestSequenceWithRedirects() throws Exception { + Assert.assertEquals( + new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), + new Byte(executeSequence(CrawlDatum.STATUS_FETCH_GONE, + CrawlDatum.STATUS_FETCH_SUCCESS, 128, true))); + } + + /** + * Check a fixed sequence! 
+ */ + @Test + public void testFixedSequence() throws Exception { + // Our test directory + Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + + System.currentTimeMillis()); + + Path segment1 = new Path(testDir, "00001"); + Path segment2 = new Path(testDir, "00002"); + Path segment3 = new Path(testDir, "00003"); + + createSegment(segment1, CrawlDatum.STATUS_FETCH_GONE, false); + createSegment(segment2, CrawlDatum.STATUS_FETCH_GONE, true); + createSegment(segment3, CrawlDatum.STATUS_FETCH_SUCCESS, false); + + // Merge the segments and get status + Path mergedSegment = merge(testDir, new Path[] { segment1, segment2, + segment3 }); + Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment)); + + Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status); + } + + /** + * Check a fixed sequence! + */ + @Test + public void testRedirFetchInOneSegment() throws Exception { + // Our test directory + Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + + System.currentTimeMillis()); + + Path segment = new Path(testDir, "00001"); + + createSegment(segment, CrawlDatum.STATUS_FETCH_SUCCESS, true, true); + + // Merge the segments and get status + Path mergedSegment = merge(testDir, new Path[] { segment }); + Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment)); + + Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status); + } + + /** + * Check a fixed sequence! 
+ */ + @Test + public void testEndsWithRedirect() throws Exception { + // Our test directory + Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + + System.currentTimeMillis()); + + Path segment1 = new Path(testDir, "00001"); + Path segment2 = new Path(testDir, "00002"); + + createSegment(segment1, CrawlDatum.STATUS_FETCH_SUCCESS, false); + createSegment(segment2, CrawlDatum.STATUS_FETCH_SUCCESS, true); + + // Merge the segments and get status + Path mergedSegment = merge(testDir, new Path[] { segment1, segment2 }); + Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment)); + + Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status); + } + + /** + * Execute a sequence of creating segments, merging them and checking the + * final output + * + * @param status + * to start with + * @param status + * to end with + * @param number + * of rounds + * @param whether + * redirects are injected randomly + * @return the CrawlDatum status + */ + protected byte executeSequence(byte firstStatus, byte lastStatus, int rounds, + boolean redirect) throws Exception { + // Our test directory + Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + + System.currentTimeMillis()); + + // Format for the segments + DecimalFormat df = new DecimalFormat("0000000"); + + // Create our segment paths + Path[] segmentPaths = new Path[rounds]; + for (int i = 0; i < rounds; i++) { + String segmentName = df.format(i); + segmentPaths[i] = new Path(testDir, segmentName); + } + + // Create the first segment according to the specified status + createSegment(segmentPaths[0], firstStatus, false); + + // Create N segments with random status and optionally with randomized + // redirect injection + for (int i = 1; i < rounds - 1; i++) { + // Status, 6 possibilities incremented with 33 hex + byte status = (byte) (rnd.nextInt(6) + 0x21); + + // Whether this is going to be a redirect + boolean addRedirect = redirect ? 
rnd.nextBoolean() : false; + // If it's a redirect we add a datum resulting from a fetch at random, + // if not: always add a fetch datum to avoid empty segments + boolean addFetch = addRedirect ? rnd.nextBoolean() : true; + + createSegment(segmentPaths[i], status, addFetch, addRedirect); + } + + // Create the last segment according to the specified status + // (additionally, add a redirect at random) + createSegment(segmentPaths[rounds - 1], lastStatus, true, + redirect ? rnd.nextBoolean() : false); + + // Merge the segments! + Path mergedSegment = merge(testDir, segmentPaths); + + // Check the status of the final record and return it + return checkMergedSegment(testDir, mergedSegment); + } + + /** + * Checks the merged segment and removes the stuff again. + * + * @param the + * test directory + * @param the + * merged segment + * @return the final status + */ + protected byte checkMergedSegment(Path testDir, Path mergedSegment) + throws Exception { + // Get a MapFile reader for the <Text,CrawlDatum> pairs + MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path( + mergedSegment, CrawlDatum.FETCH_DIR_NAME), conf); + + Text key = new Text(); + CrawlDatum value = new CrawlDatum(); + byte finalStatus = 0x0; + + for (MapFile.Reader reader : readers) { + while (reader.next(key, value)) { + LOG.info("Reading status for: " + key.toString() + " > " + + CrawlDatum.getStatusName(value.getStatus())); + + // Only consider fetch status + if (CrawlDatum.hasFetchStatus(value) + && key.toString().equals("http://nutch.apache.org/")) { + finalStatus = value.getStatus(); + } + } + + // Close the reader again + reader.close(); + } + + // Remove the test directory again + fs.delete(testDir, true); + + LOG.info("Final fetch status for: http://nutch.apache.org/ > " + + CrawlDatum.getStatusName(finalStatus)); + + // Return the final status + return finalStatus; + } + + /** + * Merge some segments! 
+ * + * @param the + * test directory + * @param the + * segments to merge + * @return Path to the merged segment + */ + protected Path merge(Path testDir, Path[] segments) throws Exception { + // Our merged output directory + Path out = new Path(testDir, "out"); + + // Merge + SegmentMerger merger = new SegmentMerger(conf); + merger.merge(out, segments, false, false, -1); + + FileStatus[] stats = fs.listStatus(out); + Assert.assertEquals(1, stats.length); + + return stats[0].getPath(); + } + + /** + * Create a segment with the specified status. + * + * @param the + * segment's paths + * @param the + * status of the record, ignored if redirect is true + * @param whether + * we're doing a redirect as well + */ + protected void createSegment(Path segment, byte status, boolean redirect) + throws Exception { + if (redirect) { + createSegment(segment, status, false, true); + } else { + createSegment(segment, status, true, false); + } + } + + protected void createSegment(Path segment, byte status, boolean fetch, + boolean redirect) throws Exception { + LOG.info("\nSegment: " + segment.toString()); + + // The URL of our main record + String url = "http://nutch.apache.org/"; + + // The URL of our redirecting URL + String redirectUrl = "http://nutch.apache.org/i_redirect_to_the_root/"; + + // Our value + CrawlDatum value = new CrawlDatum(); + + // Path of the segment's crawl_fetch directory + Path crawlFetchPath = new Path( + new Path(segment, CrawlDatum.FETCH_DIR_NAME), "part-00000"); + + // Get a writer for map files containing <Text,CrawlDatum> pairs + Option wKeyOpt = MapFile.Writer.keyClass(Text.class); + org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class); + MapFile.Writer writer = new MapFile.Writer(conf, crawlFetchPath, wKeyOpt, wValueOpt); + + // Whether we're handling a redirect now + // first add the linked datum + // - before redirect status because url sorts before redirectUrl + // - before fetch status 
to check whether fetch datum is preferred over + // linked datum when merging + if (redirect) { + // We're writing our our main record URL with status linked + LOG.info(url + " > " + CrawlDatum.getStatusName(CrawlDatum.STATUS_LINKED)); + value = new CrawlDatum(); + value.setStatus(CrawlDatum.STATUS_LINKED); + writer.append(new Text(url), value); + } + + // Whether we're fetching now + if (fetch) { + LOG.info(url + " > " + CrawlDatum.getStatusName(status)); + + // Set the status + value.setStatus(status); + + // Write the pair and ok + writer.append(new Text(url), value); + } + + // Whether we're handing a redirect now + if (redirect) { + // And the redirect URL with redirect status, pointing to our main URL + LOG.info(redirectUrl + " > " + + CrawlDatum.getStatusName(CrawlDatum.STATUS_FETCH_REDIR_TEMP)); + value.setStatus(CrawlDatum.STATUS_FETCH_REDIR_TEMP); + writer.append(new Text(redirectUrl), value); + } + + // Close the stuff + writer.close(); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/service/TestNutchServer.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/service/TestNutchServer.java b/nutch-core/src/test/java/org/apache/nutch/service/TestNutchServer.java new file mode 100644 index 0000000..1ee16c4 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/service/TestNutchServer.java @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.service; + +import javax.ws.rs.core.Response; + +import org.apache.cxf.jaxrs.client.WebClient; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TestNutchServer { + + private static final Logger LOG = LoggerFactory.getLogger(TestNutchServer.class); + NutchServer server = NutchServer.getInstance(); + + private int port[] = {8081, 9999, 9100, 8900}; + private final String ENDPOINT_ADDRESS = "http://localhost:"; + + @Test + public void testNutchServerStartup() { + boolean isRunning = false; + for(int i=0;i<port.length; i++) { + try { + startServer(port[i]); + isRunning = true; + break; + }catch(Exception e) { + LOG.info("Could not start server on port: {}. 
Tries remaining {}", port[i], port.length-i); + } + } + if(!isRunning) { + LOG.info("Could not start server, all ports in use"); + } + else { + LOG.info("Testing admin endpoint"); + WebClient client = WebClient.create(ENDPOINT_ADDRESS + server.getPort()); + Response response = client.path("admin").get(); + //Assert.assertTrue(response.readEntity(String.class).contains("startDate")); + response = client.path("stop").get(); + //Assert.assertTrue(response.readEntity(String.class).contains("Stopping")); + } + } + + private void startServer(int port) throws Exception{ + NutchServer.setPort(port); + NutchServer.startServer(); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/test/IntegrationTest.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/test/IntegrationTest.java b/nutch-core/src/test/java/org/apache/nutch/test/IntegrationTest.java new file mode 100644 index 0000000..131b667 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/test/IntegrationTest.java @@ -0,0 +1,6 @@ +package org.apache.nutch.test; + +/** + * A marker interface for marking integration tests + */ +public interface IntegrationTest {} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/test/TestUtils.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/test/TestUtils.java b/nutch-core/src/test/java/org/apache/nutch/test/TestUtils.java new file mode 100644 index 0000000..87d37a5 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/test/TestUtils.java @@ -0,0 +1,29 @@ +package org.apache.nutch.test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.net.URISyntaxException; +import java.net.URL; + +public class TestUtils { + + /** + * + * @param obj an object whose class's loader should be used + * 
@param fileName name of file + * @return File instance + * @throws FileNotFoundException when an error occurs or file is not found + */ + public static File getFile(Object obj, String fileName) + throws FileNotFoundException { + try { + URL resource = obj.getClass().getClassLoader().getResource(fileName); + if (resource == null) { + throw new FileNotFoundException(fileName + " not known to classloader of " + obj); + } + return new File(resource.toURI()); + } catch (URISyntaxException e) { + throw new FileNotFoundException(e.getMessage()); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java b/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java new file mode 100644 index 0000000..fef0e69 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java @@ -0,0 +1,126 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */

package org.apache.nutch.tools;

//Junit imports
import static org.junit.Assert.*;

import org.apache.nutch.test.TestUtils;
import org.junit.Test;

//Commons imports
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;

//JDK imports
import java.io.File;
import java.nio.file.Files;
import java.util.Collection;

//Nutch imports
import org.apache.nutch.tools.CommonCrawlDataDumper;
import org.apache.nutch.tools.CommonCrawlConfig;

/**
 *
 * Test harness for the {@link CommonCrawlDataDumper}.
 *
 */
public class TestCommonCrawlDataDumper {

  /**
   * Dumps the bundled "test-segments" classpath fixture into a fresh
   * temporary directory and asserts that every expected output file name is
   * present somewhere in the dump tree.
   */
  @Test
  public void testDump() throws Exception {
    // Segment fixture resolved from the test classpath
    File sampleSegmentDir = TestUtils.getFile(this, "test-segments");
    // Per-run scratch directory for the dump output
    File tempDir = Files.createTempDirectory("temp").toFile();

    // File names the dumper is expected to produce for the fixture
    // (hash-prefixed names; presumably content hashes computed by the
    // dumper — TODO confirm against CommonCrawlDataDumper naming scheme)
    String[] crawledFiles = {
        "c463a4381eb837f9f5d45978cfbde79e_.html",
        "a974b8d74f7779ab6c6f90b9b279467e_.html",
        "6bc6497314656a3129732efd708e9f96_.html",
        "6e88c40abe26cad0a726102997aed048_.html",
        "5cafdd88f4e9cf3f0cd4c298c6873358_apachecon-europe.html",
        "932dc10a76e894a2baa8ea4086ad72a8_apachecon-north-america.html",
        "8540187d75b9cd405b8fa97d665f9f90_.html",
        "e501bc976c8693b4d28a55b79c390a32_.html",
        "6add662f9f5758b7d75eec5cfa1f340b_.html",
        "d4f20df3c37033dc516067ee1f424e4e_.html",
        "d7b8fa9a02cdc95546030d04be4a98f3_solr.html",
        "3cbe876e3a8e7a397811de3bb6a945cd_.html",
        "5b987dde0da79d7f2e3f22b46437f514_bot.html",
        "3d742820d9a701a1f02e10d5bf5ae633_credits.html",
        "693673f3c73d04a26276effdea69b7ee_downloads.html",
        "4f7e3469dafabb4c3b87b00531f81aa4_index.html",
        "15c5330675be8a69995aab18ff9859e0_javadoc.html",
        "bc624e1b49e29870ef095819bb0e977a_mailing_lists.html",
        "a7d66b68754c3665c66e62225255e3fd_version_control.html",
        "32fb7fe362e1a0d8a1b15addf2a00bdc_1.9-rel",
        "54ab3db10fe7b26415a04e21045125a8_1zE.html",
        "1012a41c08092c40340598bd8ee0bfa6_PGa.html",
        "c830cfc5c28bed10e69d5b83e9c1bcdc_nutch_2.3",
        "687d915dc264a77f35c61ba841936730_oHY.html",
        "2bf1afb650010128b4cf4afe677db3c5_1pav9xl.html",
        "550cab79e14110bbee61c36c61c830b0_1pbE15n.html",
        "664ff07b46520cc1414494ae49da91f6_.html",
        "04223714e648a6a43d7c8af8b095f733_.html",
        "3c8ccb865cd72cca06635d74c7f2f3c4_.html",
        "90fe47b28716a2230c5122c83f0b8562_Becoming_A_Nutch_Developer.html",
        "ac0fefe70007d40644e2b8bd5da3c305_FAQ.html",
        "bc9bc7f11c1262e8924032ab1c7ce112_NutchPropertiesCompleteList.html",
        "78d04611985e7375b441e478fa36f610_.html",
        "64adaebadd44e487a8b58894e979dc70_CHANGES.txt",
        "a48e9c2659b703fdea3ad332877708d8_.html",
        "159d66d679dd4442d2d8ffe6a83b2912_sponsorship.html",
        "66f1ce6872c9195c665fc8bdde95f6dc_thanks.html",
        "ef7ee7e929a048c4a119af78492095b3_.html",
        "e4251896a982c2b2b68678b5c9c57f4d_.html",
        "5384764a16fab767ebcbc17d87758a24_.html",
        "a6ba75a218ef2a09d189cb7dffcecc0f_.html",
        "f2fa63bd7a3aca63841eed4cd10fb519_SolrCloud.html",
        "f8de0fbda874e1a140f1b07dcebab374_NUTCH-1047.html",
        "9c120e94f52d690e9cfd044c34134649_NUTCH-1591.html",
        "7dd70378379aa452279ce9200d0a5fed_NUTCH-841.html",
        "ddf78b1fe5c268d59fd62bc745815b92_.html",
        "401c9f04887dbbf8d29ad52841b8bdb3_ApacheNutch.html",
        "8f984e2d3c2ba68d1695288f1738deaf_Nutch.html",
        "c2ef09a95a956207cea073a515172be2_FrontPage.html",
        "90d9b76e8eabdab1cbcc29bea437c7ae_NutchRESTAPI.html" };

    CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(
        new CommonCrawlConfig());
    // NOTE(review): the boolean/null argument semantics are not visible
    // here — confirm against the CommonCrawlDataDumper.dump() signature.
    dumper.dump(tempDir, sampleSegmentDir, null, false, null, false, "", false);

    // Collect every regular file in the dump tree, recursing into
    // subdirectories
    Collection<File> tempFiles = FileUtils.listFiles(tempDir,
        FileFilterUtils.fileFileFilter(),
        FileFilterUtils.directoryFileFilter());

    for (String expectedFileName : crawledFiles) {
      assertTrue("Missed file " + expectedFileName + " in dump",
          hasFile(expectedFileName, tempFiles));
    }

  }

  /**
   * Returns true if a file whose name equals {@code fileName} exactly is
   * present in {@code files}.
   */
  private boolean hasFile(String fileName, Collection<File> files) {
    for (File f : files) {
      if (f.getName().equals(fileName)) {
        return true;
      }
    }
    return false;
  }
}
