/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.metadata;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Properties;

import org.junit.Assert;
import org.junit.Test;

/**
 * JUnit based tests of class
 * {@link org.apache.nutch.metadata.SpellCheckedMetadata}.
 *
 * @author Chris Mattmann
 * @author Jérôme Charron
 */
public class TestSpellCheckedMetadata {

  /** Iteration count for the (informal) I/O timing test. */
  private static final int NUM_ITERATIONS = 10000;

  /** Misspelled header names must all normalize to the canonical form. */
  @Test
  public void testGetNormalizedName() {
    Assert.assertEquals("Content-Type",
        SpellCheckedMetadata.getNormalizedName("Content-Type"));
    Assert.assertEquals("Content-Type",
        SpellCheckedMetadata.getNormalizedName("ContentType"));
    Assert.assertEquals("Content-Type",
        SpellCheckedMetadata.getNormalizedName("Content-type"));
    Assert.assertEquals("Content-Type",
        SpellCheckedMetadata.getNormalizedName("contenttype"));
    Assert.assertEquals("Content-Type",
        SpellCheckedMetadata.getNormalizedName("contentype"));
    Assert.assertEquals("Content-Type",
        SpellCheckedMetadata.getNormalizedName("contntype"));
  }

  /** Test for the <code>add(String, String)</code> method. */
  @Test
  public void testAdd() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();

    String[] values = metadata.getValues("contentype");
    Assert.assertEquals(0, values.length);

    metadata.add("contentype", "value1");
    values = metadata.getValues("contentype");
    Assert.assertEquals(1, values.length);
    Assert.assertEquals("value1", values[0]);

    // Adding under a different spelling appends to the same canonical key.
    metadata.add("Content-Type", "value2");
    values = metadata.getValues("contentype");
    Assert.assertEquals(2, values.length);
    Assert.assertEquals("value1", values[0]);
    Assert.assertEquals("value2", values[1]);

    // NOTE : For now, the same value can be added many times.
    // Should it be changed?
    metadata.add("ContentType", "value1");
    values = metadata.getValues("Content-Type");
    Assert.assertEquals(3, values.length);
    Assert.assertEquals("value1", values[0]);
    Assert.assertEquals("value2", values[1]);
    Assert.assertEquals("value1", values[2]);
  }

  /** Test for the <code>set(String, String)</code> method. */
  @Test
  public void testSet() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();

    String[] values = metadata.getValues("contentype");
    Assert.assertEquals(0, values.length);

    metadata.set("contentype", "value1");
    values = metadata.getValues("contentype");
    Assert.assertEquals(1, values.length);
    Assert.assertEquals("value1", values[0]);

    // set() replaces all previous values of the (spell-checked) key.
    metadata.set("Content-Type", "value2");
    values = metadata.getValues("contentype");
    Assert.assertEquals(1, values.length);
    Assert.assertEquals("value2", values[0]);

    metadata.set("contenttype", "new value 1");
    metadata.add("contenttype", "new value 2");
    values = metadata.getValues("contentype");
    Assert.assertEquals(2, values.length);
    Assert.assertEquals("new value 1", values[0]);
    Assert.assertEquals("new value 2", values[1]);
  }

  /** Test for <code>setAll(Properties)</code> method. */
  @Test
  public void testSetProperties() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();
    Properties props = new Properties();

    metadata.setAll(props);
    Assert.assertEquals(0, metadata.size());

    props.setProperty("name-one", "value1.1");
    metadata.setAll(props);
    Assert.assertEquals(1, metadata.size());
    String[] values = metadata.getValues("name-one");
    Assert.assertEquals(1, values.length);
    Assert.assertEquals("value1.1", values[0]);

    props.setProperty("name-two", "value2.1");
    metadata.setAll(props);
    Assert.assertEquals(2, metadata.size());
    values = metadata.getValues("name-one");
    Assert.assertEquals(1, values.length);
    Assert.assertEquals("value1.1", values[0]);
    values = metadata.getValues("name-two");
    Assert.assertEquals(1, values.length);
    Assert.assertEquals("value2.1", values[0]);
  }

  /** Test for <code>get(String)</code> method: returns the first value only. */
  @Test
  public void testGet() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();
    Assert.assertNull(metadata.get("a-name"));

    metadata.add("a-name", "value-1");
    Assert.assertEquals("value-1", metadata.get("a-name"));
    metadata.add("a-name", "value-2");
    Assert.assertEquals("value-1", metadata.get("a-name"));
  }

  /** Test for <code>isMultiValued()</code> method. */
  @Test
  public void testIsMultiValued() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();
    Assert.assertFalse(metadata.isMultiValued("key"));
    metadata.add("key", "value1");
    Assert.assertFalse(metadata.isMultiValued("key"));
    metadata.add("key", "value2");
    Assert.assertTrue(metadata.isMultiValued("key"));
  }

  /** Test for <code>names</code> method. */
  @Test
  public void testNames() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();
    String[] names = metadata.names();
    Assert.assertEquals(0, names.length);

    metadata.add("name-one", "value");
    names = metadata.names();
    Assert.assertEquals(1, names.length);
    Assert.assertEquals("name-one", names[0]);
    metadata.add("name-two", "value");
    names = metadata.names();
    Assert.assertEquals(2, names.length);
  }

  /** Test for <code>remove(String)</code> method. */
  @Test
  public void testRemove() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();
    // Removing a missing key is a no-op.
    metadata.remove("name-one");
    Assert.assertEquals(0, metadata.size());
    metadata.add("name-one", "value-1.1");
    metadata.add("name-one", "value-1.2");
    metadata.add("name-two", "value-2.2");
    Assert.assertEquals(2, metadata.size());
    Assert.assertNotNull(metadata.get("name-one"));
    Assert.assertNotNull(metadata.get("name-two"));
    metadata.remove("name-one");
    Assert.assertEquals(1, metadata.size());
    Assert.assertNull(metadata.get("name-one"));
    Assert.assertNotNull(metadata.get("name-two"));
    metadata.remove("name-two");
    Assert.assertEquals(0, metadata.size());
    Assert.assertNull(metadata.get("name-one"));
    Assert.assertNull(metadata.get("name-two"));
  }

  /** Test for <code>equals(Object)</code> method. */
  @Test
  public void testObject() {
    SpellCheckedMetadata meta1 = new SpellCheckedMetadata();
    SpellCheckedMetadata meta2 = new SpellCheckedMetadata();
    Assert.assertFalse(meta1.equals(null));
    Assert.assertFalse(meta1.equals("String"));
    Assert.assertTrue(meta1.equals(meta2));
    meta1.add("name-one", "value-1.1");
    Assert.assertFalse(meta1.equals(meta2));
    meta2.add("name-one", "value-1.1");
    Assert.assertTrue(meta1.equals(meta2));
    meta1.add("name-one", "value-1.2");
    Assert.assertFalse(meta1.equals(meta2));
    meta2.add("name-one", "value-1.2");
    Assert.assertTrue(meta1.equals(meta2));
    meta1.add("name-two", "value-2.1");
    Assert.assertFalse(meta1.equals(meta2));
    meta2.add("name-two", "value-2.1");
    Assert.assertTrue(meta1.equals(meta2));
    meta1.add("name-two", "value-2.2");
    Assert.assertFalse(meta1.equals(meta2));
    meta2.add("name-two", "value-2.x");
    Assert.assertFalse(meta1.equals(meta2));
  }

  /** Test for <code>Writable</code> implementation. */
  @Test
  public void testWritable() {
    SpellCheckedMetadata metadata = new SpellCheckedMetadata();
    SpellCheckedMetadata result = writeRead(metadata);
    Assert.assertEquals(0, result.size());
    metadata.add("name-one", "value-1.1");
    result = writeRead(metadata);
    // Added after the round-trip, so it must not appear in 'result' yet.
    metadata.add("Contenttype", "text/html");
    Assert.assertEquals(1, result.size());
    Assert.assertEquals(1, result.getValues("name-one").length);
    Assert.assertEquals("value-1.1", result.get("name-one"));
    metadata.add("name-two", "value-2.1");
    metadata.add("name-two", "value-2.2");
    result = writeRead(metadata);
    Assert.assertEquals(3, result.size());
    Assert.assertEquals(1, result.getValues("name-one").length);
    Assert.assertEquals("value-1.1", result.getValues("name-one")[0]);
    Assert.assertEquals(2, result.getValues("name-two").length);
    Assert.assertEquals("value-2.1", result.getValues("name-two")[0]);
    Assert.assertEquals("value-2.2", result.getValues("name-two")[1]);
    // "Contenttype" was spell-checked to the canonical Content-Type key.
    Assert.assertEquals("text/html", result.get(Metadata.CONTENT_TYPE));
  }

  /**
   * IO Test method, usable only when you plan to do changes in metadata to
   * measure relative performance impact.
   */
  @Test
  public final void testHandlingSpeed() {
    @SuppressWarnings("unused")
    SpellCheckedMetadata result;
    long start = System.currentTimeMillis();
    for (int i = 0; i < NUM_ITERATIONS; i++) {
      result = writeRead(constructSpellCheckedMetadata());
    }
    System.out.println(NUM_ITERATIONS + " spellchecked metadata I/O time:"
        + (System.currentTimeMillis() - start) + "ms.");
  }

  /** Serializes {@code metadata} and deserializes it into a fresh instance. */
  private SpellCheckedMetadata writeRead(SpellCheckedMetadata metadata) {
    SpellCheckedMetadata deserialized = new SpellCheckedMetadata();
    try {
      ByteArrayOutputStream out = new ByteArrayOutputStream();
      metadata.write(new DataOutputStream(out));
      deserialized.readFields(new DataInputStream(
          new ByteArrayInputStream(out.toByteArray())));
    } catch (IOException ioe) {
      Assert.fail(ioe.toString());
    }
    return deserialized;
  }

  /**
   * Assembles a Spellchecked metadata Object.
   */
  public static final SpellCheckedMetadata constructSpellCheckedMetadata() {
    SpellCheckedMetadata meta = new SpellCheckedMetadata();
    meta.add("Content-type", "foo/bar");
    meta.add("Connection", "close");
    meta.add("Last-Modified", "Sat, 09 Dec 2006 15:09:57 GMT");
    meta.add("Server", "Foobar");
    meta.add("Date", "Sat, 09 Dec 2006 18:07:20 GMT");
    meta.add("Accept-Ranges", "bytes");
    meta.add("ETag", "\"1234567-89-01234567\"");
    meta.add("Content-Length", "123");
    meta.add(Nutch.SEGMENT_NAME_KEY, "segmentzzz");
    meta.add(Nutch.SIGNATURE_KEY, "123");
    return meta;
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java b/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java new file mode 100644 index 0000000..ef07907 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.test.IntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class TestURLFilters { + + /** + * Testcase for NUTCH-325. 
+ * + * @throws URLFilterException + */ + @Test + public void testNonExistingUrlFilter() throws URLFilterException { + Configuration conf = NutchConfiguration.create(); + String class1 = "NonExistingFilter"; + String class2 = "org.apache.nutch.urlfilter.prefix.PrefixURLFilter"; + conf.set(URLFilters.URLFILTER_ORDER, class1 + " " + class2); + + URLFilters normalizers = new URLFilters(conf); + normalizers.filter("http://someurl/"); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java b/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java new file mode 100644 index 0000000..d29e9d3 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.net;

import java.net.MalformedURLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Test;
import org.junit.experimental.categories.Category;

@Category(IntegrationTest.class)
public class TestURLNormalizers {

  /**
   * Checks that a configured normalizer chain runs, removes superfluous
   * slashes (NUTCH-1011, NUTCH-1319) and respects the configured order.
   */
  @Test
  public void testURLNormalizers() {
    Configuration conf = NutchConfiguration.create();
    String clazz1 = "org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer";
    String clazz2 = "org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer";
    conf.set("urlnormalizer.order", clazz1 + " " + clazz2);

    URLNormalizers normalizers = new URLNormalizers(conf,
        URLNormalizers.SCOPE_DEFAULT);

    Assert.assertNotNull(normalizers);
    try {
      normalizers.normalize("http://www.example.com/",
          URLNormalizers.SCOPE_DEFAULT);
    } catch (MalformedURLException mue) {
      Assert.fail(mue.toString());
    }

    // NUTCH-1011 - Get rid of superfluous slashes
    // Fix: assertEquals takes (expected, actual) — the original had the
    // arguments reversed, which produces misleading failure messages.
    try {
      String normalizedSlashes = normalizers.normalize(
          "http://www.example.com//path/to//somewhere.html",
          URLNormalizers.SCOPE_DEFAULT);
      Assert.assertEquals("http://www.example.com/path/to/somewhere.html",
          normalizedSlashes);
    } catch (MalformedURLException mue) {
      Assert.fail(mue.toString());
    }

    // HostNormalizer NUTCH-1319
    try {
      String normalizedHost = normalizers.normalize(
          "http://www.example.org//path/to//somewhere.html",
          URLNormalizers.SCOPE_DEFAULT);
      Assert.assertEquals("http://www.example.org/path/to/somewhere.html",
          normalizedHost);
    } catch (MalformedURLException mue) {
      Assert.fail(mue.toString());
    }

    // check the order
    int pos1 = -1, pos2 = -1;
    URLNormalizer[] impls = normalizers
        .getURLNormalizers(URLNormalizers.SCOPE_DEFAULT);
    for (int i = 0; i < impls.length; i++) {
      if (impls[i].getClass().getName().equals(clazz1))
        pos1 = i;
      if (impls[i].getClass().getName().equals(clazz2))
        pos2 = i;
    }
    if (pos1 != -1 && pos2 != -1) {
      Assert.assertTrue("RegexURLNormalizer before BasicURLNormalizer",
          pos1 < pos2);
    }
  }
}
+ * + * @author Stephan Strittmatter - http://www.sybit.de + * + * @version 1.0 + */ +public class TestOutlinkExtractor { + + private static Configuration conf = NutchConfiguration.create(); + + @Test + public void testGetNoOutlinks() { + Outlink[] outlinks = null; + + outlinks = OutlinkExtractor.getOutlinks(null, conf); + Assert.assertNotNull(outlinks); + Assert.assertEquals(0, outlinks.length); + + outlinks = OutlinkExtractor.getOutlinks("", conf); + Assert.assertNotNull(outlinks); + Assert.assertEquals(0, outlinks.length); + } + + @Test + public void testGetOutlinksHttp() { + Outlink[] outlinks = OutlinkExtractor + .getOutlinks( + "Test with http://www.nutch.org/index.html is it found? " + + "What about www.google.com at http://www.google.de " + + "A longer URL could be http://www.sybit.com/solutions/portals.html", + conf); + + Assert.assertTrue("Url not found!", outlinks.length == 3); + Assert.assertEquals("Wrong URL", "http://www.nutch.org/index.html", + outlinks[0].getToUrl()); + Assert.assertEquals("Wrong URL", "http://www.google.de", + outlinks[1].getToUrl()); + Assert.assertEquals("Wrong URL", + "http://www.sybit.com/solutions/portals.html", outlinks[2].getToUrl()); + } + + @Test + public void testGetOutlinksHttp2() { + Outlink[] outlinks = OutlinkExtractor + .getOutlinks( + "Test with http://www.nutch.org/index.html is it found? 
" + + "What about www.google.com at http://www.google.de " + + "A longer URL could be http://www.sybit.com/solutions/portals.html", + "http://www.sybit.de", conf); + + Assert.assertTrue("Url not found!", outlinks.length == 3); + Assert.assertEquals("Wrong URL", "http://www.nutch.org/index.html", + outlinks[0].getToUrl()); + Assert.assertEquals("Wrong URL", "http://www.google.de", + outlinks[1].getToUrl()); + Assert.assertEquals("Wrong URL", + "http://www.sybit.com/solutions/portals.html", outlinks[2].getToUrl()); + } + + @Test + public void testGetOutlinksFtp() { + Outlink[] outlinks = OutlinkExtractor.getOutlinks( + "Test with ftp://www.nutch.org is it found? " + + "What about www.google.com at ftp://www.google.de", conf); + + Assert.assertTrue("Url not found!", outlinks.length > 1); + Assert.assertEquals("Wrong URL", "ftp://www.nutch.org", + outlinks[0].getToUrl()); + Assert.assertEquals("Wrong URL", "ftp://www.google.de", + outlinks[1].getToUrl()); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/parse/TestParseData.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/TestParseData.java b/nutch-core/src/test/java/org/apache/nutch/parse/TestParseData.java new file mode 100644 index 0000000..550a260 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/parse/TestParseData.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +import org.apache.nutch.util.WritableTestUtils; +import org.apache.nutch.metadata.Metadata; +import org.junit.Assert; +import org.junit.Test; + +/** Unit tests for ParseData. */ + +public class TestParseData { + + @Test + public void testParseData() throws Exception { + + String title = "The Foo Page"; + + Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo"), + new Outlink("http://bar.com/", "Bar") }; + + Metadata metaData = new Metadata(); + metaData.add("Language", "en/us"); + metaData.add("Charset", "UTF-8"); + + ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, + metaData); + + WritableTestUtils.testWritable(r, null); + } + + @Test + public void testMaxOutlinks() throws Exception { + Outlink[] outlinks = new Outlink[128]; + for (int i = 0; i < outlinks.length; i++) { + outlinks[i] = new Outlink("http://outlink.com/" + i, "Outlink" + i); + } + ParseData original = new ParseData(ParseStatus.STATUS_SUCCESS, + "Max Outlinks Title", outlinks, new Metadata()); + ParseData data = (ParseData) WritableTestUtils.writeRead(original, null); + Assert.assertEquals(outlinks.length, data.getOutlinks().length); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/parse/TestParseText.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/TestParseText.java b/nutch-core/src/test/java/org/apache/nutch/parse/TestParseText.java new file mode 100644 
index 0000000..241b293 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/parse/TestParseText.java @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +import org.apache.nutch.util.WritableTestUtils; +import org.junit.Test; + +/** Unit tests for ParseText. */ + +public class TestParseText { + + @Test + public void testParseText() throws Exception { + + String page = "Hello World The Quick Brown Fox Jumped Over the Lazy Fox"; + ParseText s = new ParseText(page); + WritableTestUtils.testWritable(s); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java b/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java new file mode 100644 index 0000000..198e284 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java @@ -0,0 +1,108 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +// Nutch imports +import org.apache.nutch.plugin.Extension; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.test.IntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +/** + * Unit test for new parse plugin selection. + * + * @author Sebastien Le Callonnec + * @version 1.0 + */ +@Category(IntegrationTest.class) +public class TestParserFactory { + + private Configuration conf; + private ParserFactory parserFactory; + + /** Inits the Test Case with the test parse-plugin file */ + @Before + public void setUp() throws Exception { + conf = NutchConfiguration.create(); + conf.set("plugin.includes", ".*"); + conf.set("parse.plugin.file", + "org/apache/nutch/parse/parse-plugin-test.xml"); + parserFactory = new ParserFactory(conf); + } + + /** Unit test for <code>getExtensions(String)</code> method. 
*/ + @Test + public void testGetExtensions() throws Exception { + Extension ext = parserFactory.getExtensions("text/html").get(0); + Assert.assertEquals("parse-tika", ext.getDescriptor().getPluginId()); + ext = parserFactory.getExtensions("text/html; charset=ISO-8859-1").get(0); + Assert.assertEquals("parse-tika", ext.getDescriptor().getPluginId()); + ext = parserFactory.getExtensions("foo/bar").get(0); + Assert.assertEquals("parse-tika", ext.getDescriptor().getPluginId()); + } + + /** Unit test to check <code>getParsers</code> method */ + @Test + public void testGetParsers() throws Exception { + Parser[] parsers = parserFactory.getParsers("text/html", "http://foo.com"); + Assert.assertNotNull(parsers); + Assert.assertEquals(1, parsers.length); + Assert.assertEquals("org.apache.nutch.parse.tika.TikaParser", parsers[0] + .getClass().getName()); + + parsers = parserFactory.getParsers("text/html; charset=ISO-8859-1", + "http://foo.com"); + Assert.assertNotNull(parsers); + Assert.assertEquals(1, parsers.length); + Assert.assertEquals("org.apache.nutch.parse.tika.TikaParser", parsers[0] + .getClass().getName()); + + parsers = parserFactory.getParsers("application/x-javascript", + "http://foo.com"); + Assert.assertNotNull(parsers); + Assert.assertEquals(1, parsers.length); + Assert.assertEquals("org.apache.nutch.parse.js.JSParseFilter", parsers[0] + .getClass().getName()); + + parsers = parserFactory.getParsers("text/plain", "http://foo.com"); + Assert.assertNotNull(parsers); + Assert.assertEquals(1, parsers.length); + Assert.assertEquals("org.apache.nutch.parse.tika.TikaParser", parsers[0] + .getClass().getName()); + + Parser parser1 = parserFactory.getParsers("text/plain", "http://foo.com")[0]; + Parser parser2 = parserFactory.getParsers("*", "http://foo.com")[0]; + + Assert.assertEquals("Different instances!", parser1.hashCode(), + parser2.hashCode()); + + // test and make sure that the rss parser is loaded even though its + // plugin.xml + // doesn't claim to support 
text/rss, only application/rss+xml + parsers = parserFactory.getParsers("text/rss", "http://foo.com"); + Assert.assertNotNull(parsers); + Assert.assertEquals(1, parsers.length); + Assert.assertEquals("org.apache.nutch.parse.tika.TikaParser", parsers[0] + .getClass().getName()); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/parse/parse-plugin-test.xml ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/parse-plugin-test.xml b/nutch-core/src/test/java/org/apache/nutch/parse/parse-plugin-test.xml new file mode 100644 index 0000000..b748905 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/parse/parse-plugin-test.xml @@ -0,0 +1,58 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Author : mattmann + Description: Test parse-plugins.xml file. +--> + +<parse-plugins> + + <!-- by default if the mimeType is set to *, or + if it can't be determined, use parse-tika --> + <mimeType name="*"> + <plugin id="parse-tika" /> + </mimeType> + + <!-- test these 4 plugins --> + <mimeType name="text/html"> + <!-- + ! Test that if a parser cannot be instanciated, + ! 
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements.  See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.

  Author : mattmann
  Description: Test parse-plugins.xml file.
-->

<parse-plugins>

  <!-- By default, if the mimeType is set to *, or if it can't be
       determined, use parse-tika -->
  <mimeType name="*">
    <plugin id="parse-tika" />
  </mimeType>

  <!-- test these 4 plugins -->
  <mimeType name="text/html">
    <!--
     ! Test that if a parser cannot be instantiated,
     ! it should not block the process and then the next one is used
     !-->
    <plugin id="parse-plugin-that-not-exist"/>
  </mimeType>

  <mimeType name="application/x-javascript">
    <plugin id="parse-js"/>
  </mimeType>

  <mimeType name="text/rss">
    <!-- Test that an extension-id can be directly used here -->
    <plugin id="org.apache.nutch.parse.rss.RSSParser"/>
  </mimeType>

  <!-- Alias mappings for parse-xxx names to the actual extension
       implementation ids described in each plugin's plugin.xml file -->
  <aliases>
    <alias name="parse-js"
           extension-id="JSParser" />
    <alias name="parse-rss"
           extension-id="org.apache.nutch.parse.rss.RSSParser" />
    <alias name="parse-tika"
           extension-id="org.apache.nutch.parse.tika.TikaParser" />
  </aliases>
</parse-plugins>
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.plugin; + +/** + * Simple Test-extensions + * + * @author joa23 + */ +public class HelloWorldExtension implements ITestExtension { + + /* + * (non-Javadoc) + * + * @see + * org.apache.nutch.plugin.ITestExtension#testGetExtension(java.lang.String) + */ + public String testGetExtension(String hello) { + return hello + " World"; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/plugin/ITestExtension.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/plugin/ITestExtension.java b/nutch-core/src/test/java/org/apache/nutch/plugin/ITestExtension.java new file mode 100644 index 0000000..b6aa81d --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/plugin/ITestExtension.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + +/** + * A Simple Test Extension Interface. + * + * @author joa23 + * + */ +public interface ITestExtension { + public String testGetExtension(String hello); +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/plugin/SimpleTestPlugin.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/plugin/SimpleTestPlugin.java b/nutch-core/src/test/java/org/apache/nutch/plugin/SimpleTestPlugin.java new file mode 100644 index 0000000..080142d --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/plugin/SimpleTestPlugin.java @@ -0,0 +1,57 @@ +/* +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.plugin; + +import org.apache.hadoop.conf.Configuration; + +/** + * Simple Test plugin + * + * @author joa23 + */ +public class SimpleTestPlugin extends Plugin { + + /** + * @param pDescriptor + * @param conf + */ + public SimpleTestPlugin(PluginDescriptor pDescriptor, Configuration conf) { + + super(pDescriptor, conf); + } + + /* + * @see org.apache.nutch.plugin.Plugin#startUp() + */ + public void startUp() throws PluginRuntimeException { + System.err.println("start up Plugin: " + getDescriptor().getPluginId()); + + } + + /* + * (non-Javadoc) + * + * @see org.apache.nutch.plugin.Plugin#shutDown() + */ + public void shutDown() throws PluginRuntimeException { + System.err.println("shutdown Plugin: " + getDescriptor().getPluginId()); + + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java b/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java new file mode 100644 index 0000000..7bcc9ab --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java @@ -0,0 +1,305 @@ +/* + /** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.util.LinkedList; +import java.util.Locale; +import java.util.Properties; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapred.JobConf; +import org.apache.nutch.test.IntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +/** + * Unit tests for the plugin system + */ +@Category(IntegrationTest.class) +public class TestPluginSystem { + private int fPluginCount; + + private LinkedList<File> fFolders = new LinkedList<File>(); + private Configuration conf; + private PluginRepository repository; + + @Before + public void setUp() throws Exception { + this.conf = NutchConfiguration.create(); + conf.set("plugin.includes", ".*"); + // String string = this.conf.get("plugin.includes", ""); + // conf.set("plugin.includes", string + "|Dummy*"); + fPluginCount = 5; + createDummyPlugins(fPluginCount); + this.repository = PluginRepository.get(conf); + } + + /* + * (non-Javadoc) + * + * @see junit.framework.TestCase#tearDown() + */ + @After + public void tearDown() throws Exception { + for (int i = 0; i < fFolders.size(); i++) { + File folder = fFolders.get(i); + delete(folder); + folder.delete(); + } + } + + /** + */ + @Test + public 
void testPluginConfiguration() { + String string = getPluginFolder(); + File file = new File(string); + if (!file.exists()) { + file.mkdir(); + } + Assert.assertTrue(file.exists()); + } + + /** + */ + @Test + public void testLoadPlugins() { + PluginDescriptor[] descriptors = repository.getPluginDescriptors(); + int k = descriptors.length; + Assert.assertTrue(fPluginCount <= k); + for (int i = 0; i < descriptors.length; i++) { + PluginDescriptor descriptor = descriptors[i]; + if (!descriptor.getPluginId().startsWith("getPluginFolder()")) { + continue; + } + Assert.assertEquals(1, descriptor.getExportedLibUrls().length); + Assert.assertEquals(1, descriptor.getNotExportedLibUrls().length); + } + } + + @Test + public void testRepositoryCache() { + Configuration config = NutchConfiguration.create(); + PluginRepository repo = PluginRepository.get(config); + JobConf job = new NutchJob(config); + PluginRepository repo1 = PluginRepository.get(job); + Assert.assertTrue(repo == repo1); + // now construct a config without UUID + config = new Configuration(); + config.addResource("nutch-default.xml"); + config.addResource("nutch-site.xml"); + repo = PluginRepository.get(config); + job = new NutchJob(config); + repo1 = PluginRepository.get(job); + Assert.assertTrue(repo1 != repo); + } + + /** + * + */ + @Test + public void testGetExtensionAndAttributes() { + String xpId = " sdsdsd"; + ExtensionPoint extensionPoint = repository.getExtensionPoint(xpId); + Assert.assertEquals(extensionPoint, null); + Extension[] extension1 = repository.getExtensionPoint(getGetExtensionId()) + .getExtensions(); + Assert.assertEquals(extension1.length, fPluginCount); + for (int i = 0; i < extension1.length; i++) { + Extension extension2 = extension1[i]; + String string = extension2.getAttribute(getGetConfigElementName()); + Assert.assertEquals(string, getParameterValue()); + } + } + + /** + * @throws PluginRuntimeException + */ + @Test + public void testGetExtensionInstances() throws 
PluginRuntimeException { + Extension[] extensions = repository.getExtensionPoint(getGetExtensionId()) + .getExtensions(); + Assert.assertEquals(extensions.length, fPluginCount); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + Object object = extension.getExtensionInstance(); + if (!(object instanceof HelloWorldExtension)) + Assert.fail(" object is not a instance of HelloWorldExtension"); + ((ITestExtension) object).testGetExtension("Bla "); + String string = ((ITestExtension) object).testGetExtension("Hello"); + Assert.assertEquals("Hello World", string); + } + } + + /** + * + * + */ + @Test + public void testGetClassLoader() { + PluginDescriptor[] descriptors = repository.getPluginDescriptors(); + for (int i = 0; i < descriptors.length; i++) { + PluginDescriptor descriptor = descriptors[i]; + Assert.assertNotNull(descriptor.getClassLoader()); + } + } + + /** + * @throws IOException + */ + @Test + public void testGetResources() throws IOException { + PluginDescriptor[] descriptors = repository.getPluginDescriptors(); + for (int i = 0; i < descriptors.length; i++) { + PluginDescriptor descriptor = descriptors[i]; + if (!descriptor.getPluginId().startsWith("getPluginFolder()")) { + continue; + } + String value = descriptor.getResourceString("key", Locale.UK); + Assert.assertEquals("value", value); + value = descriptor.getResourceString("key", Locale.TRADITIONAL_CHINESE); + Assert.assertEquals("value", value); + + } + } + + /** + * @return a PluginFolderPath + */ + private String getPluginFolder() { + String[] strings = conf.getStrings("plugin.folders"); + if (strings == null || strings.length == 0) + Assert.fail("no plugin directory setuped.."); + + String name = strings[0]; + return new PluginManifestParser(conf, this.repository) + .getPluginFolder(name).toString(); + } + + /** + * Creates some Dummy Plugins + * + * @param pCount + */ + private void createDummyPlugins(int pCount) { + String string = getPluginFolder(); + try { 
+ File folder = new File(string); + folder.mkdir(); + for (int i = 0; i < pCount; i++) { + String pluginFolder = string + File.separator + "DummyPlugin" + i; + File file = new File(pluginFolder); + file.mkdir(); + fFolders.add(file); + createPluginManifest(i, file.getAbsolutePath()); + createResourceFile(file.getAbsolutePath()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + + /** + * Creates an ResourceFile + * + * @param pFolderPath + * @throws FileNotFoundException + * @throws IOException + */ + private void createResourceFile(String pFolderPath) + throws FileNotFoundException, IOException { + Properties properties = new Properties(); + properties.setProperty("key", "value"); + properties.store(new FileOutputStream(pFolderPath + File.separator + + "messages" + ".properties"), ""); + } + + /** + * Deletes files in path + * + * @param path + * @throws IOException + */ + private void delete(File path) throws IOException { + File[] files = path.listFiles(); + for (int i = 0; i < files.length; ++i) { + if (files[i].isDirectory()) + delete(files[i]); + files[i].delete(); + } + } + + /** + * Creates an Plugin Manifest File + * + * @param i + * @param pFolderPath + * @throws IOException + */ + private void createPluginManifest(int i, String pFolderPath) + throws IOException { + FileWriter out = new FileWriter(pFolderPath + File.separator + "plugin.xml"); + String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + + "<!--this is just a simple plugin for testing issues.-->" + + "<plugin id=\"org.apache.nutch.plugin." 
+ + i + + "\" name=\"" + + i + + "\" version=\"1.0\" provider-name=\"joa23\" " + + "class=\"org.apache.nutch.plugin.SimpleTestPlugin\">" + + "<extension-point id=\"aExtensioID\" " + + "name=\"simple Parser Extension\" " + + "schema=\"schema/testExtensionPoint.exsd\"/>" + + "<runtime><library name=\"libs/exported.jar\"><extport/></library>" + + "<library name=\"libs/not_exported.jar\"/></runtime>" + + "<extension point=\"aExtensioID\">" + + "<implementation name=\"simple Parser Extension\" " + + "id=\"aExtensionId.\" class=\"org.apache.nutch.plugin.HelloWorldExtension\">" + + "<parameter name=\"dummy-name\" value=\"a simple param value\"/>" + + "</implementation></extension></plugin>"; + out.write(xml); + out.flush(); + out.close(); + } + + private String getParameterValue() { + return "a simple param value"; + } + + private static String getGetExtensionId() { + return "aExtensioID"; + } + + private static String getGetConfigElementName() { + return "dummy-name"; + } + + public static void main(String[] args) throws IOException { + new TestPluginSystem().createPluginManifest(1, "/"); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/protocol/TestContent.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/protocol/TestContent.java b/nutch-core/src/test/java/org/apache/nutch/protocol/TestContent.java new file mode 100644 index 0000000..1475cda --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/protocol/TestContent.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol; + +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.SpellCheckedMetadata; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.WritableTestUtils; +import org.apache.tika.mime.MimeTypes; +import org.junit.Assert; +import org.junit.Test; + +/** Unit tests for Content. */ + +public class TestContent { + + private static Configuration conf = NutchConfiguration.create(); + + @Test + public void testContent() throws Exception { + + String page = "<HTML><BODY><H1>Hello World</H1><P>The Quick Brown Fox Jumped Over the Lazy Fox.</BODY></HTML>"; + + String url = "http://www.foo.com/"; + + SpellCheckedMetadata metaData = new SpellCheckedMetadata(); + metaData.add("Host", "www.foo.com"); + metaData.add("Content-Type", "text/html"); + + Content r = new Content(url, url, page.getBytes("UTF8"), "text/html", + metaData, conf); + + WritableTestUtils.testWritable(r); + Assert.assertEquals("text/html", r.getMetadata().get("Content-Type")); + Assert.assertEquals("text/html", r.getMetadata().get("content-type")); + Assert.assertEquals("text/html", r.getMetadata().get("CONTENTYPE")); + } + + /** Unit tests for getContentType(String, String, byte[]) method. 
*/ + @Test + public void testGetContentType() throws Exception { + Content c = null; + Metadata p = new Metadata(); + + c = new Content("http://www.foo.com/", "http://www.foo.com/", + "".getBytes("UTF8"), "text/html; charset=UTF-8", p, conf); + Assert.assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/foo.html", "http://www.foo.com/", + "".getBytes("UTF8"), "", p, conf); + Assert.assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/foo.html", "http://www.foo.com/", + "".getBytes("UTF8"), null, p, conf); + Assert.assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/", "http://www.foo.com/", + "<html></html>".getBytes("UTF8"), "", p, conf); + Assert.assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/foo.html", "http://www.foo.com/", + "<html></html>".getBytes("UTF8"), "text/plain", p, conf); + Assert.assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/foo.png", "http://www.foo.com/", + "<html></html>".getBytes("UTF8"), "text/plain", p, conf); + Assert.assertEquals("text/html", c.getContentType()); + + c = new Content("http://www.foo.com/", "http://www.foo.com/", + "".getBytes("UTF8"), "", p, conf); + Assert.assertEquals(MimeTypes.OCTET_STREAM, c.getContentType()); + + c = new Content("http://www.foo.com/", "http://www.foo.com/", + "".getBytes("UTF8"), null, p, conf); + Assert.assertNotNull(c.getContentType()); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java b/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java new file mode 100644 index 0000000..6b4c8fd --- /dev/null +++ 
b/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.test.IntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.ObjectCache; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class TestProtocolFactory { + + Configuration conf; + ProtocolFactory factory; + + @Before + public void setUp() throws Exception { + conf = NutchConfiguration.create(); + conf.set("plugin.includes", ".*"); + conf.set("http.agent.name", "test-bot"); + factory = new ProtocolFactory(conf); + } + + @Test + public void testGetProtocol() { + + // non existing protocol + try { + factory.getProtocol("xyzxyz://somehost"); + Assert.fail("Must throw ProtocolNotFound"); + } catch (ProtocolNotFound e) { + // all is ok + } catch (Exception ex) { + Assert.fail("Must not throw any other exception"); + } + + Protocol httpProtocol = null; + + // existing protocol + try { + httpProtocol = 
factory.getProtocol("http://somehost"); + Assert.assertNotNull(httpProtocol); + } catch (Exception ex) { + Assert.fail("Must not throw any other exception"); + } + + // cache key + Object protocol = ObjectCache.get(conf).getObject( + Protocol.X_POINT_ID + "http"); + Assert.assertNotNull(protocol); + Assert.assertEquals(httpProtocol, protocol); + + // test same object instance + try { + Assert.assertTrue(httpProtocol == factory.getProtocol("http://somehost")); + } catch (ProtocolNotFound e) { + Assert.fail("Must not throw any exception"); + } + } + + @Test + public void testContains() { + Assert.assertTrue(factory.contains("http", "http")); + Assert.assertTrue(factory.contains("http", "http,ftp")); + Assert.assertTrue(factory.contains("http", " http , ftp")); + Assert.assertTrue(factory.contains("smb", "ftp,smb,http")); + Assert.assertFalse(factory.contains("smb", "smbb")); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMerger.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMerger.java b/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMerger.java new file mode 100644 index 0000000..6657c42 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMerger.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.segment; + +import java.text.DecimalFormat; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.MapFile.Writer.Option; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.MapFileOutputFormat; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestSegmentMerger { + Configuration conf; + FileSystem fs; + Path testDir; + Path seg1; + Path seg2; + Path out; + int countSeg1, countSeg2; + + @Before + public void setUp() throws Exception { + conf = NutchConfiguration.create(); + fs = FileSystem.get(conf); + testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + + System.currentTimeMillis()); + seg1 = new Path(testDir, "seg1"); + seg2 = new Path(testDir, "seg2"); + out = new Path(testDir, "out"); + + // create large parse-text segments + System.err.println("Creating large segment 1..."); + DecimalFormat df = new DecimalFormat("0000000"); + Text k = new Text(); + Path ptPath = new Path(new Path(seg1, ParseText.DIR_NAME), "part-00000"); + Option kOpt = MapFile.Writer.keyClass(Text.class); + org.apache.hadoop.io.SequenceFile.Writer.Option vOpt = SequenceFile.Writer.valueClass(ParseText.class); + MapFile.Writer w = new MapFile.Writer(conf, 
ptPath, kOpt, vOpt); + long curSize = 0; + countSeg1 = 0; + FileStatus fileStatus = fs.getFileStatus(ptPath); + long blkSize = fileStatus.getBlockSize(); + + while (curSize < blkSize * 2) { + k.set("seg1-" + df.format(countSeg1)); + w.append(k, new ParseText("seg1 text " + countSeg1)); + countSeg1++; + curSize += 40; // roughly ... + } + w.close(); + System.err.println(" - done: " + countSeg1 + " records."); + System.err.println("Creating large segment 2..."); + ptPath = new Path(new Path(seg2, ParseText.DIR_NAME), "part-00000"); + Option wKeyOpt = MapFile.Writer.keyClass(Text.class); + org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(ParseText.class); + w = new MapFile.Writer(conf, ptPath, wKeyOpt, wValueOpt); + curSize = 0; + countSeg2 = 0; + while (curSize < blkSize * 2) { + k.set("seg2-" + df.format(countSeg2)); + w.append(k, new ParseText("seg2 text " + countSeg2)); + countSeg2++; + curSize += 40; // roughly ... + } + w.close(); + System.err.println(" - done: " + countSeg2 + " records."); + } + + @After + public void tearDown() throws Exception { + fs.delete(testDir, true); + } + + @Test + public void testLargeMerge() throws Exception { + SegmentMerger merger = new SegmentMerger(conf); + merger.merge(out, new Path[] { seg1, seg2 }, false, false, -1); + // verify output + FileStatus[] stats = fs.listStatus(out); + // there should be just one path + Assert.assertEquals(1, stats.length); + Path outSeg = stats[0].getPath(); + Text k = new Text(); + ParseText v = new ParseText(); + MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path( + outSeg, ParseText.DIR_NAME), conf); + int cnt1 = 0, cnt2 = 0; + for (MapFile.Reader r : readers) { + while (r.next(k, v)) { + String ks = k.toString(); + String vs = v.getText(); + if (ks.startsWith("seg1-")) { + cnt1++; + Assert.assertTrue(vs.startsWith("seg1 ")); + } else if (ks.startsWith("seg2-")) { + cnt2++; + Assert.assertTrue(vs.startsWith("seg2 ")); + } + } + 
r.close(); + } + Assert.assertEquals(countSeg1, cnt1); + Assert.assertEquals(countSeg2, cnt2); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java b/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java new file mode 100644 index 0000000..aaed8bc --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java @@ -0,0 +1,427 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.segment; + +import java.text.DecimalFormat; +import java.util.Random; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.MapFile.Writer.Option; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.MapFileOutputFormat; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * New SegmentMerger unit test focusing on several crappy issues with the + * segment merger. The general problem is disappearing records and incorrect + * CrawlDatum status values. This unit test performs random sequences of segment + * merging where we're looking for an expected status. A second test is able to + * randomly inject redirects in segment, likely causing the segment merger to + * fail resulting in a bad merged segment. + * + * See also: + * + * https://issues.apache.org/jira/browse/NUTCH-1113 + * https://issues.apache.org/jira/browse/NUTCH-1616 + * https://issues.apache.org/jira/browse/NUTCH-1520 + * + * Cheers! 
+ */ +public class TestSegmentMergerCrawlDatums { + Configuration conf; + FileSystem fs; + Random rnd; + + private static final Logger LOG = LoggerFactory + .getLogger(TestSegmentMergerCrawlDatums.class); + + @Before + public void setUp() throws Exception { + conf = NutchConfiguration.create(); + fs = FileSystem.get(conf); + rnd = new Random(); + } + + /** + * + */ + @Test + public void testSingleRandomSequence() throws Exception { + Assert.assertEquals( + new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), + new Byte(executeSequence(CrawlDatum.STATUS_FETCH_GONE, + CrawlDatum.STATUS_FETCH_SUCCESS, 256, false))); + } + + /** + * + */ + @Test + public void testMostlyRedirects() throws Exception { + // Our test directory + Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + + System.currentTimeMillis()); + + Path segment1 = new Path(testDir, "20140110114943"); + Path segment2 = new Path(testDir, "20140110114832"); + Path segment3 = new Path(testDir, "20140110114558"); + Path segment4 = new Path(testDir, "20140110114930"); + Path segment5 = new Path(testDir, "20140110114545"); + Path segment6 = new Path(testDir, "20140110114507"); + Path segment7 = new Path(testDir, "20140110114903"); + Path segment8 = new Path(testDir, "20140110114724"); + + createSegment(segment1, CrawlDatum.STATUS_FETCH_SUCCESS, true); + createSegment(segment2, CrawlDatum.STATUS_FETCH_SUCCESS, true); + createSegment(segment3, CrawlDatum.STATUS_FETCH_SUCCESS, true); + createSegment(segment4, CrawlDatum.STATUS_FETCH_SUCCESS, true); + createSegment(segment5, CrawlDatum.STATUS_FETCH_SUCCESS, true); + createSegment(segment6, CrawlDatum.STATUS_FETCH_SUCCESS, false); + createSegment(segment7, CrawlDatum.STATUS_FETCH_SUCCESS, true); + createSegment(segment8, CrawlDatum.STATUS_FETCH_SUCCESS, true); + + // Merge the segments and get status + Path mergedSegment = merge(testDir, new Path[] { segment1, segment2, + segment3, segment4, segment5, segment6, segment7, segment8 }); + Byte status = new Byte(status = 
checkMergedSegment(testDir, mergedSegment)); + + Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status); + } + + /** + * + */ + @Test + public void testRandomizedSequences() throws Exception { + for (int i = 0; i < rnd.nextInt(16) + 16; i++) { + byte expectedStatus = (byte) (rnd.nextInt(6) + 0x21); + while (expectedStatus == CrawlDatum.STATUS_FETCH_RETRY + || expectedStatus == CrawlDatum.STATUS_FETCH_NOTMODIFIED) { + // fetch_retry and fetch_notmodified never remain in a merged segment + expectedStatus = (byte) (rnd.nextInt(6) + 0x21); + } + byte randomStatus = (byte) (rnd.nextInt(6) + 0x21); + int rounds = rnd.nextInt(16) + 32; + boolean withRedirects = rnd.nextBoolean(); + + byte resultStatus = executeSequence(randomStatus, expectedStatus, rounds, + withRedirects); + Assert.assertEquals( + "Expected status = " + CrawlDatum.getStatusName(expectedStatus) + + ", but got " + CrawlDatum.getStatusName(resultStatus) + + " when merging " + rounds + " segments" + + (withRedirects ? " with redirects" : ""), expectedStatus, + resultStatus); + } + } + + /** + * + */ + @Test + public void testRandomTestSequenceWithRedirects() throws Exception { + Assert.assertEquals( + new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), + new Byte(executeSequence(CrawlDatum.STATUS_FETCH_GONE, + CrawlDatum.STATUS_FETCH_SUCCESS, 128, true))); + } + + /** + * Check a fixed sequence! 
+ */ + @Test + public void testFixedSequence() throws Exception { + // Our test directory + Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + + System.currentTimeMillis()); + + Path segment1 = new Path(testDir, "00001"); + Path segment2 = new Path(testDir, "00002"); + Path segment3 = new Path(testDir, "00003"); + + createSegment(segment1, CrawlDatum.STATUS_FETCH_GONE, false); + createSegment(segment2, CrawlDatum.STATUS_FETCH_GONE, true); + createSegment(segment3, CrawlDatum.STATUS_FETCH_SUCCESS, false); + + // Merge the segments and get status + Path mergedSegment = merge(testDir, new Path[] { segment1, segment2, + segment3 }); + Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment)); + + Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status); + } + + /** + * Check a fixed sequence! + */ + @Test + public void testRedirFetchInOneSegment() throws Exception { + // Our test directory + Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + + System.currentTimeMillis()); + + Path segment = new Path(testDir, "00001"); + + createSegment(segment, CrawlDatum.STATUS_FETCH_SUCCESS, true, true); + + // Merge the segments and get status + Path mergedSegment = merge(testDir, new Path[] { segment }); + Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment)); + + Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status); + } + + /** + * Check a fixed sequence! 
+ */ + @Test + public void testEndsWithRedirect() throws Exception { + // Our test directory + Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + + System.currentTimeMillis()); + + Path segment1 = new Path(testDir, "00001"); + Path segment2 = new Path(testDir, "00002"); + + createSegment(segment1, CrawlDatum.STATUS_FETCH_SUCCESS, false); + createSegment(segment2, CrawlDatum.STATUS_FETCH_SUCCESS, true); + + // Merge the segments and get status + Path mergedSegment = merge(testDir, new Path[] { segment1, segment2 }); + Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment)); + + Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status); + } + + /** + * Execute a sequence of creating segments, merging them and checking the + * final output + * + * @param status + * to start with + * @param status + * to end with + * @param number + * of rounds + * @param whether + * redirects are injected randomly + * @return the CrawlDatum status + */ + protected byte executeSequence(byte firstStatus, byte lastStatus, int rounds, + boolean redirect) throws Exception { + // Our test directory + Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + + System.currentTimeMillis()); + + // Format for the segments + DecimalFormat df = new DecimalFormat("0000000"); + + // Create our segment paths + Path[] segmentPaths = new Path[rounds]; + for (int i = 0; i < rounds; i++) { + String segmentName = df.format(i); + segmentPaths[i] = new Path(testDir, segmentName); + } + + // Create the first segment according to the specified status + createSegment(segmentPaths[0], firstStatus, false); + + // Create N segments with random status and optionally with randomized + // redirect injection + for (int i = 1; i < rounds - 1; i++) { + // Status, 6 possibilities incremented with 33 hex + byte status = (byte) (rnd.nextInt(6) + 0x21); + + // Whether this is going to be a redirect + boolean addRedirect = redirect ? 
rnd.nextBoolean() : false; + // If it's a redirect we add a datum resulting from a fetch at random, + // if not: always add a fetch datum to avoid empty segments + boolean addFetch = addRedirect ? rnd.nextBoolean() : true; + + createSegment(segmentPaths[i], status, addFetch, addRedirect); + } + + // Create the last segment according to the specified status + // (additionally, add a redirect at random) + createSegment(segmentPaths[rounds - 1], lastStatus, true, + redirect ? rnd.nextBoolean() : false); + + // Merge the segments! + Path mergedSegment = merge(testDir, segmentPaths); + + // Check the status of the final record and return it + return checkMergedSegment(testDir, mergedSegment); + } + + /** + * Checks the merged segment and removes the stuff again. + * + * @param the + * test directory + * @param the + * merged segment + * @return the final status + */ + protected byte checkMergedSegment(Path testDir, Path mergedSegment) + throws Exception { + // Get a MapFile reader for the <Text,CrawlDatum> pairs + MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path( + mergedSegment, CrawlDatum.FETCH_DIR_NAME), conf); + + Text key = new Text(); + CrawlDatum value = new CrawlDatum(); + byte finalStatus = 0x0; + + for (MapFile.Reader reader : readers) { + while (reader.next(key, value)) { + LOG.info("Reading status for: " + key.toString() + " > " + + CrawlDatum.getStatusName(value.getStatus())); + + // Only consider fetch status + if (CrawlDatum.hasFetchStatus(value) + && key.toString().equals("http://nutch.apache.org/")) { + finalStatus = value.getStatus(); + } + } + + // Close the reader again + reader.close(); + } + + // Remove the test directory again + fs.delete(testDir, true); + + LOG.info("Final fetch status for: http://nutch.apache.org/ > " + + CrawlDatum.getStatusName(finalStatus)); + + // Return the final status + return finalStatus; + } + + /** + * Merge some segments! 
+ * + * @param the + * test directory + * @param the + * segments to merge + * @return Path to the merged segment + */ + protected Path merge(Path testDir, Path[] segments) throws Exception { + // Our merged output directory + Path out = new Path(testDir, "out"); + + // Merge + SegmentMerger merger = new SegmentMerger(conf); + merger.merge(out, segments, false, false, -1); + + FileStatus[] stats = fs.listStatus(out); + Assert.assertEquals(1, stats.length); + + return stats[0].getPath(); + } + + /** + * Create a segment with the specified status. + * + * @param the + * segment's paths + * @param the + * status of the record, ignored if redirect is true + * @param whether + * we're doing a redirect as well + */ + protected void createSegment(Path segment, byte status, boolean redirect) + throws Exception { + if (redirect) { + createSegment(segment, status, false, true); + } else { + createSegment(segment, status, true, false); + } + } + + protected void createSegment(Path segment, byte status, boolean fetch, + boolean redirect) throws Exception { + LOG.info("\nSegment: " + segment.toString()); + + // The URL of our main record + String url = "http://nutch.apache.org/"; + + // The URL of our redirecting URL + String redirectUrl = "http://nutch.apache.org/i_redirect_to_the_root/"; + + // Our value + CrawlDatum value = new CrawlDatum(); + + // Path of the segment's crawl_fetch directory + Path crawlFetchPath = new Path( + new Path(segment, CrawlDatum.FETCH_DIR_NAME), "part-00000"); + + // Get a writer for map files containing <Text,CrawlDatum> pairs + Option wKeyOpt = MapFile.Writer.keyClass(Text.class); + org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class); + MapFile.Writer writer = new MapFile.Writer(conf, crawlFetchPath, wKeyOpt, wValueOpt); + + // Whether we're handling a redirect now + // first add the linked datum + // - before redirect status because url sorts before redirectUrl + // - before fetch status 
to check whether fetch datum is preferred over + // linked datum when merging + if (redirect) { + // We're writing our our main record URL with status linked + LOG.info(url + " > " + CrawlDatum.getStatusName(CrawlDatum.STATUS_LINKED)); + value = new CrawlDatum(); + value.setStatus(CrawlDatum.STATUS_LINKED); + writer.append(new Text(url), value); + } + + // Whether we're fetching now + if (fetch) { + LOG.info(url + " > " + CrawlDatum.getStatusName(status)); + + // Set the status + value.setStatus(status); + + // Write the pair and ok + writer.append(new Text(url), value); + } + + // Whether we're handing a redirect now + if (redirect) { + // And the redirect URL with redirect status, pointing to our main URL + LOG.info(redirectUrl + " > " + + CrawlDatum.getStatusName(CrawlDatum.STATUS_FETCH_REDIR_TEMP)); + value.setStatus(CrawlDatum.STATUS_FETCH_REDIR_TEMP); + writer.append(new Text(redirectUrl), value); + } + + // Close the stuff + writer.close(); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/service/TestNutchServer.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/service/TestNutchServer.java b/nutch-core/src/test/java/org/apache/nutch/service/TestNutchServer.java new file mode 100644 index 0000000..1ee16c4 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/service/TestNutchServer.java @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.service; + +import javax.ws.rs.core.Response; + +import org.apache.cxf.jaxrs.client.WebClient; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TestNutchServer { + + private static final Logger LOG = LoggerFactory.getLogger(TestNutchServer.class); + NutchServer server = NutchServer.getInstance(); + + private int port[] = {8081, 9999, 9100, 8900}; + private final String ENDPOINT_ADDRESS = "http://localhost:"; + + @Test + public void testNutchServerStartup() { + boolean isRunning = false; + for(int i=0;i<port.length; i++) { + try { + startServer(port[i]); + isRunning = true; + break; + }catch(Exception e) { + LOG.info("Could not start server on port: {}. 
Tries remaining {}", port[i], port.length-i); + } + } + if(!isRunning) { + LOG.info("Could not start server, all ports in use"); + } + else { + LOG.info("Testing admin endpoint"); + WebClient client = WebClient.create(ENDPOINT_ADDRESS + server.getPort()); + Response response = client.path("admin").get(); + //Assert.assertTrue(response.readEntity(String.class).contains("startDate")); + response = client.path("stop").get(); + //Assert.assertTrue(response.readEntity(String.class).contains("Stopping")); + } + } + + private void startServer(int port) throws Exception{ + NutchServer.setPort(port); + NutchServer.startServer(); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/test/IntegrationTest.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/test/IntegrationTest.java b/nutch-core/src/test/java/org/apache/nutch/test/IntegrationTest.java new file mode 100644 index 0000000..131b667 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/test/IntegrationTest.java @@ -0,0 +1,6 @@ +package org.apache.nutch.test; + +/** + * A marker interface for marking integration tests + */ +public interface IntegrationTest {} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/test/TestUtils.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/test/TestUtils.java b/nutch-core/src/test/java/org/apache/nutch/test/TestUtils.java new file mode 100644 index 0000000..87d37a5 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/test/TestUtils.java @@ -0,0 +1,29 @@ +package org.apache.nutch.test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.net.URISyntaxException; +import java.net.URL; + +public class TestUtils { + + /** + * + * @param obj an object whose class's loader should be used + * 
@param fileName name of file + * @return File instance + * @throws FileNotFoundException when an error occurs or file is not found + */ + public static File getFile(Object obj, String fileName) + throws FileNotFoundException { + try { + URL resource = obj.getClass().getClassLoader().getResource(fileName); + if (resource == null) { + throw new FileNotFoundException(fileName + " not known to classloader of " + obj); + } + return new File(resource.toURI()); + } catch (URISyntaxException e) { + throw new FileNotFoundException(e.getMessage()); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java b/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java new file mode 100644 index 0000000..fef0e69 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java @@ -0,0 +1,126 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */

package org.apache.nutch.tools;

//Junit imports
import static org.junit.Assert.*;

import org.apache.nutch.test.TestUtils;
import org.junit.Test;

//Commons imports
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;

//JDK imports
import java.io.File;
import java.nio.file.Files;
import java.util.Collection;

//Nutch imports
import org.apache.nutch.tools.CommonCrawlDataDumper;
import org.apache.nutch.tools.CommonCrawlConfig;

/**
 *
 * Test harness for the {@link CommonCrawlDataDumper}.
 *
 */
public class TestCommonCrawlDataDumper {

  /**
   * Dumps the bundled "test-segments" classpath fixture into a fresh
   * temporary directory and asserts that every expected output file name is
   * present somewhere in the dump tree.
   */
  @Test
  public void testDump() throws Exception {
    // Segment fixture resolved from the test classpath
    File sampleSegmentDir = TestUtils.getFile(this, "test-segments");
    // Per-run scratch directory for the dump output
    File tempDir = Files.createTempDirectory("temp").toFile();

    // File names the dumper is expected to produce for the fixture
    // (hash-prefixed names; presumably content hashes computed by the
    // dumper — TODO confirm against CommonCrawlDataDumper naming scheme)
    String[] crawledFiles = {
        "c463a4381eb837f9f5d45978cfbde79e_.html",
        "a974b8d74f7779ab6c6f90b9b279467e_.html",
        "6bc6497314656a3129732efd708e9f96_.html",
        "6e88c40abe26cad0a726102997aed048_.html",
        "5cafdd88f4e9cf3f0cd4c298c6873358_apachecon-europe.html",
        "932dc10a76e894a2baa8ea4086ad72a8_apachecon-north-america.html",
        "8540187d75b9cd405b8fa97d665f9f90_.html",
        "e501bc976c8693b4d28a55b79c390a32_.html",
        "6add662f9f5758b7d75eec5cfa1f340b_.html",
        "d4f20df3c37033dc516067ee1f424e4e_.html",
        "d7b8fa9a02cdc95546030d04be4a98f3_solr.html",
        "3cbe876e3a8e7a397811de3bb6a945cd_.html",
        "5b987dde0da79d7f2e3f22b46437f514_bot.html",
        "3d742820d9a701a1f02e10d5bf5ae633_credits.html",
        "693673f3c73d04a26276effdea69b7ee_downloads.html",
        "4f7e3469dafabb4c3b87b00531f81aa4_index.html",
        "15c5330675be8a69995aab18ff9859e0_javadoc.html",
        "bc624e1b49e29870ef095819bb0e977a_mailing_lists.html",
        "a7d66b68754c3665c66e62225255e3fd_version_control.html",
        "32fb7fe362e1a0d8a1b15addf2a00bdc_1.9-rel",
        "54ab3db10fe7b26415a04e21045125a8_1zE.html",
        "1012a41c08092c40340598bd8ee0bfa6_PGa.html",
        "c830cfc5c28bed10e69d5b83e9c1bcdc_nutch_2.3",
        "687d915dc264a77f35c61ba841936730_oHY.html",
        "2bf1afb650010128b4cf4afe677db3c5_1pav9xl.html",
        "550cab79e14110bbee61c36c61c830b0_1pbE15n.html",
        "664ff07b46520cc1414494ae49da91f6_.html",
        "04223714e648a6a43d7c8af8b095f733_.html",
        "3c8ccb865cd72cca06635d74c7f2f3c4_.html",
        "90fe47b28716a2230c5122c83f0b8562_Becoming_A_Nutch_Developer.html",
        "ac0fefe70007d40644e2b8bd5da3c305_FAQ.html",
        "bc9bc7f11c1262e8924032ab1c7ce112_NutchPropertiesCompleteList.html",
        "78d04611985e7375b441e478fa36f610_.html",
        "64adaebadd44e487a8b58894e979dc70_CHANGES.txt",
        "a48e9c2659b703fdea3ad332877708d8_.html",
        "159d66d679dd4442d2d8ffe6a83b2912_sponsorship.html",
        "66f1ce6872c9195c665fc8bdde95f6dc_thanks.html",
        "ef7ee7e929a048c4a119af78492095b3_.html",
        "e4251896a982c2b2b68678b5c9c57f4d_.html",
        "5384764a16fab767ebcbc17d87758a24_.html",
        "a6ba75a218ef2a09d189cb7dffcecc0f_.html",
        "f2fa63bd7a3aca63841eed4cd10fb519_SolrCloud.html",
        "f8de0fbda874e1a140f1b07dcebab374_NUTCH-1047.html",
        "9c120e94f52d690e9cfd044c34134649_NUTCH-1591.html",
        "7dd70378379aa452279ce9200d0a5fed_NUTCH-841.html",
        "ddf78b1fe5c268d59fd62bc745815b92_.html",
        "401c9f04887dbbf8d29ad52841b8bdb3_ApacheNutch.html",
        "8f984e2d3c2ba68d1695288f1738deaf_Nutch.html",
        "c2ef09a95a956207cea073a515172be2_FrontPage.html",
        "90d9b76e8eabdab1cbcc29bea437c7ae_NutchRESTAPI.html" };

    CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(
        new CommonCrawlConfig());
    // NOTE(review): the boolean/null argument semantics are not visible
    // here — confirm against the CommonCrawlDataDumper.dump() signature.
    dumper.dump(tempDir, sampleSegmentDir, null, false, null, false, "", false);

    // Collect every regular file in the dump tree, recursing into
    // subdirectories
    Collection<File> tempFiles = FileUtils.listFiles(tempDir,
        FileFilterUtils.fileFileFilter(),
        FileFilterUtils.directoryFileFilter());

    for (String expectedFileName : crawledFiles) {
      assertTrue("Missed file " + expectedFileName + " in dump",
          hasFile(expectedFileName, tempFiles));
    }

  }

  /**
   * Returns true if a file whose name equals {@code fileName} exactly is
   * present in {@code files}.
   */
  private boolean hasFile(String fileName, Collection<File> files) {
    for (File f : files) {
      if (f.getName().equals(fileName)) {
        return true;
      }
    }
    return false;
  }
}
