http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java deleted file mode 100644 index b86181e..0000000 --- a/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java +++ /dev/null @@ -1,61 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.urlfilter.regex; - -// JDK imports -import java.io.IOException; -import java.io.Reader; - -import org.apache.nutch.net.*; -// Nutch imports -import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest; -import org.junit.Assert; -import org.junit.Test; - -/** - * JUnit based test of class <code>RegexURLFilter</code>. - * - * @author Jérôme Charron - */ -public class TestRegexURLFilter extends RegexURLFilterBaseTest { - - protected URLFilter getURLFilter(Reader rules) { - try { - return new RegexURLFilter(rules); - } catch (IOException e) { - Assert.fail(e.toString()); - return null; - } - } - - @Test - public void test() { - test("WholeWebCrawling"); - test("IntranetCrawling"); - bench(50, "Benchmarks"); - bench(100, "Benchmarks"); - bench(200, "Benchmarks"); - bench(400, "Benchmarks"); - bench(800, "Benchmarks"); - } - - @Test - public void test1838() { - test("nutch1838"); - } - -}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-suffix/src/test/java/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-suffix/src/test/java/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java b/nutch-plugins/urlfilter-suffix/src/test/java/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java new file mode 100644 index 0000000..b09ca2f --- /dev/null +++ b/nutch-plugins/urlfilter-suffix/src/test/java/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java @@ -0,0 +1,123 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.suffix; + +import java.io.IOException; +import java.io.StringReader; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * JUnit test for <code>SuffixURLFilter</code>. + * + * @author Andrzej Bialecki + */ +public class TestSuffixURLFilter { + private static final String suffixes = "# this is a comment\n" + "\n" + + ".gif\n" + ".jpg\n" + ".js\n"; + + private static final String[] urls = new String[] { + "http://www.example.com/test.gif", "http://www.example.com/TEST.GIF", + "http://www.example.com/test.jpg", "http://www.example.com/test.JPG", + "http://www.example.com/test.html", "http://www.example.com/test.HTML", + "http://www.example.com/test.html?q=abc.js", + "http://www.example.com/test.js?foo=bar&baz=bar#12333", }; + + private static String[] urlsModeAccept = new String[] { null, urls[1], null, + urls[3], urls[4], urls[5], null, urls[7] }; + + private static String[] urlsModeReject = new String[] { urls[0], null, + urls[2], null, null, null, urls[6], null }; + + private static String[] urlsModeAcceptIgnoreCase = new String[] { null, null, + null, null, urls[4], urls[5], null, urls[7] }; + + private static String[] urlsModeRejectIgnoreCase = new String[] { urls[0], + urls[1], urls[2], urls[3], null, null, urls[6], null }; + + private static String[] urlsModeAcceptAndPathFilter = new String[] { null, + urls[1], null, urls[3], urls[4], urls[5], urls[6], null }; + + private static String[] urlsModeAcceptAndNonPathFilter = new String[] { null, + urls[1], null, urls[3], urls[4], urls[5], null, urls[7] }; + + private SuffixURLFilter filter = null; + + @Before + public void setUp() throws IOException { + filter = new SuffixURLFilter(new StringReader(suffixes)); + } + + @Test + public void testModeAccept() { + filter.setIgnoreCase(false); + filter.setModeAccept(true); + for (int i = 0; i < urls.length; i++) { + Assert.assertTrue(urlsModeAccept[i] == filter.filter(urls[i])); + } + } + + @Test + public void testModeReject() { + filter.setIgnoreCase(false); + filter.setModeAccept(false); + for (int i = 0; i < urls.length; i++) { + Assert.assertTrue(urlsModeReject[i] == filter.filter(urls[i])); + } + } + + @Test + public void testModeAcceptIgnoreCase() { + filter.setIgnoreCase(true); + filter.setModeAccept(true); + for (int i = 0; i < urls.length; i++) { + Assert.assertTrue(urlsModeAcceptIgnoreCase[i] == filter.filter(urls[i])); + } + } + + @Test + public void testModeRejectIgnoreCase() { + filter.setIgnoreCase(true); + filter.setModeAccept(false); + for (int i = 0; i < urls.length; i++) { + Assert.assertTrue(urlsModeRejectIgnoreCase[i] == filter.filter(urls[i])); + } + } + + @Test + public void testModeAcceptAndNonPathFilter() { + filter.setModeAccept(true); + filter.setFilterFromPath(false); + for (int i = 0; i < urls.length; i++) { + Assert.assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter + .filter(urls[i])); + } + } + + @Test + public void testModeAcceptAndPathFilter() { + filter.setModeAccept(true); + filter.setFilterFromPath(true); + for (int i = 0; i < urls.length; i++) { + Assert.assertTrue(urlsModeAcceptAndPathFilter[i] == filter + .filter(urls[i])); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java b/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java deleted file mode 100644 index b09ca2f..0000000 --- a/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java +++ /dev/null @@ -1,123 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.urlfilter.suffix; - -import java.io.IOException; -import java.io.StringReader; - -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -/** - * JUnit test for <code>SuffixURLFilter</code>. - * - * @author Andrzej Bialecki - */ -public class TestSuffixURLFilter { - private static final String suffixes = "# this is a comment\n" + "\n" - + ".gif\n" + ".jpg\n" + ".js\n"; - - private static final String[] urls = new String[] { - "http://www.example.com/test.gif", "http://www.example.com/TEST.GIF", - "http://www.example.com/test.jpg", "http://www.example.com/test.JPG", - "http://www.example.com/test.html", "http://www.example.com/test.HTML", - "http://www.example.com/test.html?q=abc.js", - "http://www.example.com/test.js?foo=bar&baz=bar#12333", }; - - private static String[] urlsModeAccept = new String[] { null, urls[1], null, - urls[3], urls[4], urls[5], null, urls[7] }; - - private static String[] urlsModeReject = new String[] { urls[0], null, - urls[2], null, null, null, urls[6], null }; - - private static String[] urlsModeAcceptIgnoreCase = new String[] { null, null, - null, null, urls[4], urls[5], null, urls[7] }; - - private static String[] urlsModeRejectIgnoreCase = new String[] { urls[0], - urls[1], urls[2], urls[3], null, null, urls[6], null }; - - private static String[] urlsModeAcceptAndPathFilter = new String[] { null, - urls[1], null, urls[3], urls[4], urls[5], urls[6], null }; - - private static String[] urlsModeAcceptAndNonPathFilter = new String[] { null, - urls[1], null, urls[3], urls[4], urls[5], null, urls[7] }; - - private SuffixURLFilter filter = null; - - @Before - public void setUp() throws IOException { - filter = new SuffixURLFilter(new StringReader(suffixes)); - } - - @Test - public void testModeAccept() { - filter.setIgnoreCase(false); - filter.setModeAccept(true); - for (int i = 0; i < urls.length; i++) { - Assert.assertTrue(urlsModeAccept[i] == filter.filter(urls[i])); - } - } - - @Test - public void testModeReject() { - filter.setIgnoreCase(false); - filter.setModeAccept(false); - for (int i = 0; i < urls.length; i++) { - Assert.assertTrue(urlsModeReject[i] == filter.filter(urls[i])); - } - } - - @Test - public void testModeAcceptIgnoreCase() { - filter.setIgnoreCase(true); - filter.setModeAccept(true); - for (int i = 0; i < urls.length; i++) { - Assert.assertTrue(urlsModeAcceptIgnoreCase[i] == filter.filter(urls[i])); - } - } - - @Test - public void testModeRejectIgnoreCase() { - filter.setIgnoreCase(true); - filter.setModeAccept(false); - for (int i = 0; i < urls.length; i++) { - Assert.assertTrue(urlsModeRejectIgnoreCase[i] == filter.filter(urls[i])); - } - } - - @Test - public void testModeAcceptAndNonPathFilter() { - filter.setModeAccept(true); - filter.setFilterFromPath(false); - for (int i = 0; i < urls.length; i++) { - Assert.assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter - .filter(urls[i])); - } - } - - @Test - public void testModeAcceptAndPathFilter() { - filter.setModeAccept(true); - filter.setFilterFromPath(true); - for (int i = 0; i < urls.length; i++) { - Assert.assertTrue(urlsModeAcceptAndPathFilter[i] == filter - .filter(urls[i])); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-validator/src/test/java/org/apache/nutch/urlfilter/validator/TestUrlValidator.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-validator/src/test/java/org/apache/nutch/urlfilter/validator/TestUrlValidator.java b/nutch-plugins/urlfilter-validator/src/test/java/org/apache/nutch/urlfilter/validator/TestUrlValidator.java new file mode 100644 index 0000000..2e6d695 --- /dev/null +++ b/nutch-plugins/urlfilter-validator/src/test/java/org/apache/nutch/urlfilter/validator/TestUrlValidator.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.validator; + +import org.apache.nutch.urlfilter.validator.UrlValidator; +import org.junit.Assert; +import org.junit.Test; + +/** + * JUnit test case which tests 1. that valid urls are not filtered while invalid + * ones are filtered. 2. that Urls' scheme, authority, path and query are + * validated. + * + * @author tejasp + * + */ + +public class TestUrlValidator { + + /** + * Test method for + * {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)} + * . + */ + @Test + public void testFilter() { + UrlValidator url_validator = new UrlValidator(); + Assert.assertNotNull(url_validator); + + Assert.assertNull("Filtering on a null object should return null", + url_validator.filter(null)); + Assert.assertNull("Invalid url: example.com/file[/].html", + url_validator.filter("example.com/file[/].html")); + Assert.assertNull("Invalid url: http://www.example.com/space here.html", + url_validator.filter("http://www.example.com/space here.html")); + Assert.assertNull("Invalid url: /main.html", + url_validator.filter("/main.html")); + Assert.assertNull("Invalid url: www.example.com/main.html", + url_validator.filter("www.example.com/main.html")); + Assert.assertNull("Invalid url: ftp:www.example.com/main.html", + url_validator.filter("ftp:www.example.com/main.html")); + Assert.assertNull( + "Inalid url: http://999.000.456.32/nutch/trunk/README.txt", + url_validator.filter("http://999.000.456.32/nutch/trunk/README.txt")); + Assert.assertNull("Invalid url: http://www.example.com/ma|in\\toc.html", + url_validator.filter(" http://www.example.com/ma|in\\toc.html")); + + Assert.assertNotNull( + "Valid url: https://issues.apache.org/jira/NUTCH-1127", + url_validator.filter("https://issues.apache.org/jira/NUTCH-1127")); + Assert + .assertNotNull( + "Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather", + url_validator + .filter("http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather")); + Assert + .assertNotNull( + "Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress", + url_validator + .filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress")); + Assert.assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf", + url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf")); + + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java b/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java deleted file mode 100644 index 2e6d695..0000000 --- a/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.urlfilter.validator; - -import org.apache.nutch.urlfilter.validator.UrlValidator; -import org.junit.Assert; -import org.junit.Test; - -/** - * JUnit test case which tests 1. that valid urls are not filtered while invalid - * ones are filtered. 2. that Urls' scheme, authority, path and query are - * validated. - * - * @author tejasp - * - */ - -public class TestUrlValidator { - - /** - * Test method for - * {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)} - * . - */ - @Test - public void testFilter() { - UrlValidator url_validator = new UrlValidator(); - Assert.assertNotNull(url_validator); - - Assert.assertNull("Filtering on a null object should return null", - url_validator.filter(null)); - Assert.assertNull("Invalid url: example.com/file[/].html", - url_validator.filter("example.com/file[/].html")); - Assert.assertNull("Invalid url: http://www.example.com/space here.html", - url_validator.filter("http://www.example.com/space here.html")); - Assert.assertNull("Invalid url: /main.html", - url_validator.filter("/main.html")); - Assert.assertNull("Invalid url: www.example.com/main.html", - url_validator.filter("www.example.com/main.html")); - Assert.assertNull("Invalid url: ftp:www.example.com/main.html", - url_validator.filter("ftp:www.example.com/main.html")); - Assert.assertNull( - "Inalid url: http://999.000.456.32/nutch/trunk/README.txt", - url_validator.filter("http://999.000.456.32/nutch/trunk/README.txt")); - Assert.assertNull("Invalid url: http://www.example.com/ma|in\\toc.html", - url_validator.filter(" http://www.example.com/ma|in\\toc.html")); - - Assert.assertNotNull( - "Valid url: https://issues.apache.org/jira/NUTCH-1127", - url_validator.filter("https://issues.apache.org/jira/NUTCH-1127")); - Assert - .assertNotNull( - "Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather", - url_validator - .filter("http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather")); - Assert - .assertNotNull( - "Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress", - url_validator - .filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress")); - Assert.assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf", - url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf")); - - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-ajax/src/test/java/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-ajax/src/test/java/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java b/nutch-plugins/urlnormalizer-ajax/src/test/java/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java new file mode 100644 index 0000000..d815c45 --- /dev/null +++ b/nutch-plugins/urlnormalizer-ajax/src/test/java/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.ajax; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +/** Unit tests for AjaxURLNormalizer. */ +public class TestAjaxURLNormalizer extends TestCase { + private AjaxURLNormalizer normalizer; + private Configuration conf; + + public TestAjaxURLNormalizer(String name) { + super(name); + normalizer = new AjaxURLNormalizer(); + conf = NutchConfiguration.create(); + normalizer.setConf(conf); + } + + public void testNormalizer() throws Exception { + // check if AJAX URL's are normalized to an _escaped_frament_ form + normalizeTest("http://example.org/#!k=v", "http://example.org/?_escaped_fragment_=k=v"); + + // Check with some escaped chars + normalizeTest("http://example.org/#!k=v&something=is wrong", "http://example.org/?_escaped_fragment_=k=v%26something=is%20wrong"); + + // Check with query string and multiple fragment params + normalizeTest("http://example.org/path.html?queryparam=queryvalue#!key1=value1&key2=value2", "http://example.org/path.html?queryparam=queryvalue&_escaped_fragment_=key1=value1%26key2=value2"); + } + + public void testNormalizerWhenIndexing() throws Exception { + // check if it works the other way around + normalizeTest("http://example.org/?_escaped_fragment_=key=value", "http://example.org/#!key=value", URLNormalizers.SCOPE_INDEXER); + normalizeTest("http://example.org/?key=value&_escaped_fragment_=key=value", "http://example.org/?key=value#!key=value", URLNormalizers.SCOPE_INDEXER); + normalizeTest("http://example.org/page.html?key=value&_escaped_fragment_=key=value%26something=is%20wrong", "http://example.org/page.html?key=value#!key=value&something=is wrong", URLNormalizers.SCOPE_INDEXER); + } + + private void normalizeTest(String weird, String normal) throws Exception { + assertEquals(normal, normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT)); + } + + private void normalizeTest(String weird, String normal, String scope) throws Exception { + assertEquals(normal, normalizer.normalize(weird, scope)); + } + + public static void main(String[] args) throws Exception { + new TestAjaxURLNormalizer("test").testNormalizer(); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java b/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java deleted file mode 100644 index d815c45..0000000 --- a/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.net.urlnormalizer.ajax; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.net.URLNormalizers; -import org.apache.nutch.util.NutchConfiguration; - -import junit.framework.TestCase; - -/** Unit tests for AjaxURLNormalizer. */ -public class TestAjaxURLNormalizer extends TestCase { - private AjaxURLNormalizer normalizer; - private Configuration conf; - - public TestAjaxURLNormalizer(String name) { - super(name); - normalizer = new AjaxURLNormalizer(); - conf = NutchConfiguration.create(); - normalizer.setConf(conf); - } - - public void testNormalizer() throws Exception { - // check if AJAX URL's are normalized to an _escaped_frament_ form - normalizeTest("http://example.org/#!k=v", "http://example.org/?_escaped_fragment_=k=v"); - - // Check with some escaped chars - normalizeTest("http://example.org/#!k=v&something=is wrong", "http://example.org/?_escaped_fragment_=k=v%26something=is%20wrong"); - - // Check with query string and multiple fragment params - normalizeTest("http://example.org/path.html?queryparam=queryvalue#!key1=value1&key2=value2", "http://example.org/path.html?queryparam=queryvalue&_escaped_fragment_=key1=value1%26key2=value2"); - } - - public void testNormalizerWhenIndexing() throws Exception { - // check if it works the other way around - normalizeTest("http://example.org/?_escaped_fragment_=key=value", "http://example.org/#!key=value", URLNormalizers.SCOPE_INDEXER); - normalizeTest("http://example.org/?key=value&_escaped_fragment_=key=value", "http://example.org/?key=value#!key=value", URLNormalizers.SCOPE_INDEXER); - normalizeTest("http://example.org/page.html?key=value&_escaped_fragment_=key=value%26something=is%20wrong", "http://example.org/page.html?key=value#!key=value&something=is wrong", URLNormalizers.SCOPE_INDEXER); - } - - private void normalizeTest(String weird, String normal) throws Exception { - assertEquals(normal, normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT)); - } - - private void normalizeTest(String weird, String normal, String scope) throws Exception { - assertEquals(normal, normalizer.normalize(weird, scope)); - } - - public static void main(String[] args) throws Exception { - new TestAjaxURLNormalizer("test").testNormalizer(); - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-basic/src/test/java/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-basic/src/test/java/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/nutch-plugins/urlnormalizer-basic/src/test/java/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java new file mode 100644 index 0000000..9a0f8c4 --- /dev/null +++ b/nutch-plugins/urlnormalizer-basic/src/test/java/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -0,0 +1,175 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.basic; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +/** Unit tests for BasicURLNormalizer. */ +public class TestBasicURLNormalizer { + private BasicURLNormalizer normalizer; + + private Configuration conf; + + public TestBasicURLNormalizer() { + normalizer = new BasicURLNormalizer(); + conf = NutchConfiguration.create(); + normalizer.setConf(conf); + } + + @Test + public void testNUTCH1098() throws Exception { + // check that % encoding is normalized + normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); + + // check that % encoding works correctly at end of URL + normalizeTest("http://foo.com/%66oo.htm%6c", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/%66oo.ht%6dl", "http://foo.com/foo.html"); + + // check that % decoder do not overlap strings + normalizeTest("http://foo.com/%66oo.ht%6d%6c", "http://foo.com/foo.html"); + + // check that % decoder leaves high bit chars alone + normalizeTest("http://foo.com/%66oo.htm%C0", "http://foo.com/foo.htm%C0"); + + // check that % decoder leaves control chars alone + normalizeTest("http://foo.com/%66oo.htm%1A", "http://foo.com/foo.htm%1A"); + + // check that % decoder converts to upper case letters + normalizeTest("http://foo.com/%66oo.htm%c0", "http://foo.com/foo.htm%C0"); + + // check that % decoder leaves encoded spaces alone + normalizeTest("http://foo.com/you%20too.html", "http://foo.com/you%20too.html"); + + // check that spaces are encoded into %20 + normalizeTest("http://foo.com/you too.html", "http://foo.com/you%20too.html"); + + // check that encoded # are not decoded + normalizeTest("http://foo.com/file.html%23cz", "http://foo.com/file.html%23cz"); + + // check that encoded / are not decoded + normalizeTest("http://foo.com/fast/dir%2fcz", "http://foo.com/fast/dir%2Fcz"); + + // check that control chars are encoded + normalizeTest("http://foo.com/\u001a!", "http://foo.com/%1A!"); + + // check that control chars are always encoded into 2 digits + normalizeTest("http://foo.com/\u0001!", "http://foo.com/%01!"); + + // check encoding of spanish chars + normalizeTest("http://mydomain.com/en Espa\u00F1ol.aspx", "http://mydomain.com/en%20Espa%C3%B1ol.aspx"); + } + + @Test + public void testNUTCH2064() throws Exception { + // Ampersand and colon and other punctuation characters are not to be unescaped + normalizeTest("http://x.com/s?q=a%26b&m=10", "http://x.com/s?q=a%26b&m=10"); + normalizeTest("http://x.com/show?http%3A%2F%2Fx.com%2Fb", + "http://x.com/show?http%3A%2F%2Fx.com%2Fb"); + normalizeTest("http://google.com/search?q=c%2B%2B", + "http://google.com/search?q=c%2B%2B"); + // do also not touch the query part which is application/x-www-form-urlencoded + normalizeTest("http://x.com/s?q=a+b", "http://x.com/s?q=a+b"); + // and keep Internationalized domain names + // http://bücher.de/ may be http://xn--bcher-kva.de/ + // but definitely not http://b%C3%BCcher.de/ + normalizeTest("http://b\u00fccher.de/", "http://b\u00fccher.de/"); + // test whether percent-encoding works together with other normalizations + normalizeTest("http://x.com/./a/../%66.html", "http://x.com/f.html"); + // [ and ] need escaping as well + normalizeTest("http://x.com/?x[y]=1", "http://x.com/?x%5By%5D=1"); + // boundary test for first character outside the ASCII range (U+0080) + normalizeTest("http://x.com/foo\u0080", "http://x.com/foo%C2%80"); + normalizeTest("http://x.com/foo%c2%80", "http://x.com/foo%C2%80"); + } + + @Test + public void testNormalizer() throws Exception { + // check that leading and trailing spaces are removed + normalizeTest(" http://foo.com/ ", "http://foo.com/"); + + // check that protocol is lower cased + normalizeTest("HTTP://foo.com/", "http://foo.com/"); + + // check that host is lower cased + normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html"); + normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html"); + + // check that port number is normalized + normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html"); + normalizeTest("http://foo.com:81/", "http://foo.com:81/"); + + // check that null path is normalized + normalizeTest("http://foo.com", "http://foo.com/"); + + // check that references are removed + normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html"); + + // // check that encoding is normalized + // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); + + // check that unnecessary "../" are removed + + normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html"); + normalizeTest("http://foo.com/aa/../", "http://foo.com/"); + normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/"); + normalizeTest("http://foo.com/aa/..", "http://foo.com/"); + normalizeTest("http://foo.com/aa/bb/cc/../../foo.html", + "http://foo.com/aa/foo.html"); + normalizeTest("http://foo.com/aa/bb/../cc/dd/../ee/foo.html", + "http://foo.com/aa/cc/ee/foo.html"); + normalizeTest("http://foo.com/../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/../../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/aa/../../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/aa/../bb/../foo.html/../../", + "http://foo.com/"); + normalizeTest("http://foo.com/../aa/foo.html", "http://foo.com/aa/foo.html"); + normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/a..a/foo.html", + "http://foo.com/a..a/foo.html"); + normalizeTest("http://foo.com/a..a/../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/foo.foo/../foo.html", + "http://foo.com/foo.html"); + normalizeTest("http://foo.com//aa/bb/foo.html", + "http://foo.com/aa/bb/foo.html"); + normalizeTest("http://foo.com/aa//bb/foo.html", + "http://foo.com/aa/bb/foo.html"); + normalizeTest("http://foo.com/aa/bb//foo.html", + "http://foo.com/aa/bb/foo.html"); + normalizeTest("http://foo.com//aa//bb//foo.html", + "http://foo.com/aa/bb/foo.html"); + normalizeTest("http://foo.com////aa////bb////foo.html", + "http://foo.com/aa/bb/foo.html"); + normalizeTest("http://foo.com/aa?referer=http://bar.com", + "http://foo.com/aa?referer=http://bar.com"); + } + + private void normalizeTest(String weird, String normal) throws Exception { + Assert.assertEquals("normalizing: " + weird, normal, + normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT)); + } + + public static void main(String[] args) throws Exception { + new TestBasicURLNormalizer().testNormalizer(); + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java deleted file mode 100644 index 9a0f8c4..0000000 --- a/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java +++ /dev/null @@ -1,175 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.net.urlnormalizer.basic; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.net.URLNormalizers; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -/** Unit tests for BasicURLNormalizer. */ -public class TestBasicURLNormalizer { - private BasicURLNormalizer normalizer; - - private Configuration conf; - - public TestBasicURLNormalizer() { - normalizer = new BasicURLNormalizer(); - conf = NutchConfiguration.create(); - normalizer.setConf(conf); - } - - @Test - public void testNUTCH1098() throws Exception { - // check that % encoding is normalized - normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); - - // check that % encoding works correctly at end of URL - normalizeTest("http://foo.com/%66oo.htm%6c", "http://foo.com/foo.html"); - normalizeTest("http://foo.com/%66oo.ht%6dl", "http://foo.com/foo.html"); - - // check that % decoder do not overlap strings - normalizeTest("http://foo.com/%66oo.ht%6d%6c", "http://foo.com/foo.html"); - - // check that % decoder leaves high bit chars alone - normalizeTest("http://foo.com/%66oo.htm%C0", "http://foo.com/foo.htm%C0"); - - // check that % decoder leaves control chars alone - normalizeTest("http://foo.com/%66oo.htm%1A", "http://foo.com/foo.htm%1A"); - - // check that % decoder converts to upper case letters - normalizeTest("http://foo.com/%66oo.htm%c0", "http://foo.com/foo.htm%C0"); - - // check that % decoder leaves encoded spaces alone - normalizeTest("http://foo.com/you%20too.html", "http://foo.com/you%20too.html"); - - // check that spaces are encoded into %20 - normalizeTest("http://foo.com/you too.html", "http://foo.com/you%20too.html"); - - // check that encoded # are not decoded - normalizeTest("http://foo.com/file.html%23cz", "http://foo.com/file.html%23cz"); - - // check that encoded / are not decoded - normalizeTest("http://foo.com/fast/dir%2fcz", "http://foo.com/fast/dir%2Fcz"); - - // check that control chars are encoded - normalizeTest("http://foo.com/\u001a!", "http://foo.com/%1A!"); - - // check that control chars are always encoded into 2 digits - normalizeTest("http://foo.com/\u0001!", "http://foo.com/%01!"); - - // check encoding of spanish chars - normalizeTest("http://mydomain.com/en Espa\u00F1ol.aspx", "http://mydomain.com/en%20Espa%C3%B1ol.aspx"); - } - - @Test - public void testNUTCH2064() throws Exception { - // Ampersand and colon and other punctuation characters are not to be unescaped - normalizeTest("http://x.com/s?q=a%26b&m=10", "http://x.com/s?q=a%26b&m=10"); - normalizeTest("http://x.com/show?http%3A%2F%2Fx.com%2Fb", - "http://x.com/show?http%3A%2F%2Fx.com%2Fb"); - normalizeTest("http://google.com/search?q=c%2B%2B", - "http://google.com/search?q=c%2B%2B"); - // do also not touch the query part which is application/x-www-form-urlencoded - normalizeTest("http://x.com/s?q=a+b", "http://x.com/s?q=a+b"); - // and keep Internationalized domain names - // http://bücher.de/ may be http://xn--bcher-kva.de/ - // but definitely not http://b%C3%BCcher.de/ - normalizeTest("http://b\u00fccher.de/", "http://b\u00fccher.de/"); - // test whether percent-encoding works together with other normalizations - normalizeTest("http://x.com/./a/../%66.html", "http://x.com/f.html"); - // [ and ] need escaping as well - normalizeTest("http://x.com/?x[y]=1", "http://x.com/?x%5By%5D=1"); - // boundary test for first character outside the ASCII range (U+0080) - normalizeTest("http://x.com/foo\u0080", "http://x.com/foo%C2%80"); - normalizeTest("http://x.com/foo%c2%80", "http://x.com/foo%C2%80"); - } - - @Test - public void testNormalizer() throws Exception { - // check that leading and trailing spaces are removed - normalizeTest(" http://foo.com/ ", "http://foo.com/"); - - // check that protocol is lower cased - normalizeTest("HTTP://foo.com/", "http://foo.com/"); - - // check that host is lower cased - normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html"); - normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html"); - - // check that port number is normalized - normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html"); - normalizeTest("http://foo.com:81/", "http://foo.com:81/"); - - // check that null path is normalized - normalizeTest("http://foo.com", "http://foo.com/"); - - // check that references are removed - normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html"); - - // // check that encoding is normalized - // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); - - // check that unnecessary "../" are removed - - normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html"); - normalizeTest("http://foo.com/aa/../", "http://foo.com/"); - normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/"); - normalizeTest("http://foo.com/aa/..", "http://foo.com/"); - normalizeTest("http://foo.com/aa/bb/cc/../../foo.html", - "http://foo.com/aa/foo.html"); - normalizeTest("http://foo.com/aa/bb/../cc/dd/../ee/foo.html", - "http://foo.com/aa/cc/ee/foo.html"); - normalizeTest("http://foo.com/../foo.html", "http://foo.com/foo.html"); - normalizeTest("http://foo.com/../../foo.html", "http://foo.com/foo.html"); - normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html"); - normalizeTest("http://foo.com/aa/../../foo.html", "http://foo.com/foo.html"); - normalizeTest("http://foo.com/aa/../bb/../foo.html/../../", - "http://foo.com/"); - normalizeTest("http://foo.com/../aa/foo.html", "http://foo.com/aa/foo.html"); - normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html"); - normalizeTest("http://foo.com/a..a/foo.html", - "http://foo.com/a..a/foo.html"); - normalizeTest("http://foo.com/a..a/../foo.html", "http://foo.com/foo.html"); - normalizeTest("http://foo.com/foo.foo/../foo.html", - "http://foo.com/foo.html"); - normalizeTest("http://foo.com//aa/bb/foo.html", - "http://foo.com/aa/bb/foo.html"); - normalizeTest("http://foo.com/aa//bb/foo.html", - "http://foo.com/aa/bb/foo.html"); - normalizeTest("http://foo.com/aa/bb//foo.html", - "http://foo.com/aa/bb/foo.html"); - normalizeTest("http://foo.com//aa//bb//foo.html", - "http://foo.com/aa/bb/foo.html"); - normalizeTest("http://foo.com////aa////bb////foo.html", - "http://foo.com/aa/bb/foo.html"); - normalizeTest("http://foo.com/aa?referer=http://bar.com", - "http://foo.com/aa?referer=http://bar.com"); - } - - private void normalizeTest(String weird, String normal) throws Exception { - Assert.assertEquals("normalizing: " + weird, normal, - normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT)); - } - - public static void main(String[] args) throws Exception { - new TestBasicURLNormalizer().testNormalizer(); - } - -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-host/src/test/java/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-host/src/test/java/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java b/nutch-plugins/urlnormalizer-host/src/test/java/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java new file mode 100644 index 0000000..c9e1a2c --- /dev/null +++ b/nutch-plugins/urlnormalizer-host/src/test/java/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.host; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +public class TestHostURLNormalizer { + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); + + @Test + public void testHostURLNormalizer() throws Exception { + Configuration conf = NutchConfiguration.create(); + + String hostsFile = SAMPLES + SEPARATOR + "hosts.txt"; + HostURLNormalizer normalizer = new HostURLNormalizer(hostsFile); + normalizer.setConf(conf); + + // Force www. sub domain when hitting link without sub domain + Assert.assertEquals("http://www.example.org/page.html", + normalizer.normalize("http://example.org/page.html", + URLNormalizers.SCOPE_DEFAULT)); + + // Force no sub domain to www. URL's + Assert.assertEquals("http://example.net/path/to/something.html", normalizer + .normalize("http://www.example.net/path/to/something.html", + URLNormalizers.SCOPE_DEFAULT)); + + // Force all sub domains to www. + Assert.assertEquals("http://example.com/?does=it&still=work", normalizer + .normalize("http://example.com/?does=it&still=work", + URLNormalizers.SCOPE_DEFAULT)); + Assert.assertEquals("http://example.com/buh", normalizer.normalize( + "http://http.www.example.com/buh", URLNormalizers.SCOPE_DEFAULT)); + Assert.assertEquals("http://example.com/blaat", normalizer.normalize( + "http://whatever.example.com/blaat", URLNormalizers.SCOPE_DEFAULT)); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java b/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java deleted file mode 100644 index c9e1a2c..0000000 --- a/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.net.urlnormalizer.host; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.net.URLNormalizers; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -public class TestHostURLNormalizer { - - private final static String SEPARATOR = System.getProperty("file.separator"); - private final static String SAMPLES = System.getProperty("test.data", "."); - - @Test - public void testHostURLNormalizer() throws Exception { - Configuration conf = NutchConfiguration.create(); - - String hostsFile = SAMPLES + SEPARATOR + "hosts.txt"; - HostURLNormalizer normalizer = new HostURLNormalizer(hostsFile); - normalizer.setConf(conf); - - // Force www. sub domain when hitting link without sub domain - Assert.assertEquals("http://www.example.org/page.html", - normalizer.normalize("http://example.org/page.html", - URLNormalizers.SCOPE_DEFAULT)); - - // Force no sub domain to www. URL's - Assert.assertEquals("http://example.net/path/to/something.html", normalizer - .normalize("http://www.example.net/path/to/something.html", - URLNormalizers.SCOPE_DEFAULT)); - - // Force all sub domains to www. - Assert.assertEquals("http://example.com/?does=it&still=work", normalizer - .normalize("http://example.com/?does=it&still=work", - URLNormalizers.SCOPE_DEFAULT)); - Assert.assertEquals("http://example.com/buh", normalizer.normalize( - "http://http.www.example.com/buh", URLNormalizers.SCOPE_DEFAULT)); - Assert.assertEquals("http://example.com/blaat", normalizer.normalize( - "http://whatever.example.com/blaat", URLNormalizers.SCOPE_DEFAULT)); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-pass/src/test/java/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-pass/src/test/java/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java b/nutch-plugins/urlnormalizer-pass/src/test/java/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java new file mode 100644 index 0000000..f470c62 --- /dev/null +++ b/nutch-plugins/urlnormalizer-pass/src/test/java/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.pass; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +public class TestPassURLNormalizer { + + @Test + public void testPassURLNormalizer() { + Configuration conf = NutchConfiguration.create(); + + PassURLNormalizer normalizer = new PassURLNormalizer(); + normalizer.setConf(conf); + String url = "http://www.example.com/test/..//"; + String result = null; + try { + result = normalizer.normalize(url, URLNormalizers.SCOPE_DEFAULT); + } catch (MalformedURLException mue) { + Assert.fail(mue.toString()); + } + + Assert.assertEquals(url, result); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java b/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java deleted file mode 100644 index f470c62..0000000 --- a/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.net.urlnormalizer.pass; - -import java.net.MalformedURLException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.net.URLNormalizers; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -public class TestPassURLNormalizer { - - @Test - public void testPassURLNormalizer() { - Configuration conf = NutchConfiguration.create(); - - PassURLNormalizer normalizer = new PassURLNormalizer(); - normalizer.setConf(conf); - String url = "http://www.example.com/test/..//"; - String result = null; - try { - result = normalizer.normalize(url, URLNormalizers.SCOPE_DEFAULT); - } catch (MalformedURLException mue) { - Assert.fail(mue.toString()); - } - - Assert.assertEquals(url, result); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-protocol/src/test/java/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-protocol/src/test/java/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java b/nutch-plugins/urlnormalizer-protocol/src/test/java/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java new file mode 100644 index 0000000..8880628 --- /dev/null +++ b/nutch-plugins/urlnormalizer-protocol/src/test/java/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.protocol; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestProtocolURLNormalizer extends TestCase { + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); + + public void testProtocolURLNormalizer() throws Exception { + Configuration conf = NutchConfiguration.create(); + + String protocolsFile = SAMPLES + SEPARATOR + "protocols.txt"; + ProtocolURLNormalizer normalizer = new ProtocolURLNormalizer(protocolsFile); + normalizer.setConf(conf); + + // No change + assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT)); + + // https to http + assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT)); + + // no change + assertEquals("https://example.io/", normalizer.normalize("https://example.io/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("https://example.nl/", normalizer.normalize("https://example.nl/", URLNormalizers.SCOPE_DEFAULT)); + + // http to https + assertEquals("https://example.io/", normalizer.normalize("http://example.io/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("https://example.nl/", normalizer.normalize("http://example.nl/", URLNormalizers.SCOPE_DEFAULT)); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java b/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java deleted file mode 100644 index 8880628..0000000 --- a/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.net.urlnormalizer.protocol; - -import java.net.MalformedURLException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.net.URLNormalizers; -import org.apache.nutch.util.NutchConfiguration; - -import junit.framework.TestCase; - -public class TestProtocolURLNormalizer extends TestCase { - - private final static String SEPARATOR = System.getProperty("file.separator"); - private final static String SAMPLES = System.getProperty("test.data", "."); - - public void testProtocolURLNormalizer() throws Exception { - Configuration conf = NutchConfiguration.create(); - - String protocolsFile = SAMPLES + SEPARATOR + "protocols.txt"; - ProtocolURLNormalizer normalizer = new ProtocolURLNormalizer(protocolsFile); - normalizer.setConf(conf); - - // No change - assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT)); - assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT)); - - // https to http - assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT)); - assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT)); - - // no change - assertEquals("https://example.io/", normalizer.normalize("https://example.io/", URLNormalizers.SCOPE_DEFAULT)); - assertEquals("https://example.nl/", normalizer.normalize("https://example.nl/", URLNormalizers.SCOPE_DEFAULT)); - - // http to https - assertEquals("https://example.io/", normalizer.normalize("http://example.io/", URLNormalizers.SCOPE_DEFAULT)); - assertEquals("https://example.nl/", normalizer.normalize("http://example.nl/", URLNormalizers.SCOPE_DEFAULT)); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-querystring/src/test/java/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-querystring/src/test/java/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java b/nutch-plugins/urlnormalizer-querystring/src/test/java/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java new file mode 100644 index 0000000..b85c55d --- /dev/null +++ b/nutch-plugins/urlnormalizer-querystring/src/test/java/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.querystring; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestQuerystringURLNormalizer extends TestCase { + + public void testQuerystringURLNormalizer() throws Exception { + Configuration conf = NutchConfiguration.create(); + + QuerystringURLNormalizer normalizer = new QuerystringURLNormalizer(); + normalizer.setConf(conf); + + assertEquals("http://example.com/?a=b&c=d", normalizer.normalize( + "http://example.com/?c=d&a=b", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.com/a/b/c", normalizer.normalize( + "http://example.com/a/b/c", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.com:1234/a/b/c", normalizer.normalize( + "http://example.com:1234/a/b/c", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.com:1234/a/b/c#ref", normalizer.normalize( + "http://example.com:1234/a/b/c#ref", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.com:1234/a/b/c?a=b&c=d#ref", + normalizer.normalize("http://example.com:1234/a/b/c?c=d&a=b#ref", + URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.com/?a=b&a=c&c=d", normalizer.normalize( + "http://example.com/?c=d&a=b&a=c", URLNormalizers.SCOPE_DEFAULT)); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java b/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java deleted file mode 100644 index b85c55d..0000000 --- a/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.net.urlnormalizer.querystring; - -import java.net.MalformedURLException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.net.URLNormalizers; -import org.apache.nutch.util.NutchConfiguration; - -import junit.framework.TestCase; - -public class TestQuerystringURLNormalizer extends TestCase { - - public void testQuerystringURLNormalizer() throws Exception { - Configuration conf = NutchConfiguration.create(); - - QuerystringURLNormalizer normalizer = new QuerystringURLNormalizer(); - normalizer.setConf(conf); - - assertEquals("http://example.com/?a=b&c=d", normalizer.normalize( - "http://example.com/?c=d&a=b", URLNormalizers.SCOPE_DEFAULT)); - assertEquals("http://example.com/a/b/c", normalizer.normalize( - "http://example.com/a/b/c", URLNormalizers.SCOPE_DEFAULT)); - assertEquals("http://example.com:1234/a/b/c", normalizer.normalize( - "http://example.com:1234/a/b/c", URLNormalizers.SCOPE_DEFAULT)); - assertEquals("http://example.com:1234/a/b/c#ref", normalizer.normalize( - "http://example.com:1234/a/b/c#ref", URLNormalizers.SCOPE_DEFAULT)); - assertEquals("http://example.com:1234/a/b/c?a=b&c=d#ref", - normalizer.normalize("http://example.com:1234/a/b/c?c=d&a=b#ref", - URLNormalizers.SCOPE_DEFAULT)); - assertEquals("http://example.com/?a=b&a=c&c=d", normalizer.normalize( - "http://example.com/?c=d&a=b&a=c", URLNormalizers.SCOPE_DEFAULT)); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-regex/src/test/java/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/src/test/java/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java b/nutch-plugins/urlnormalizer-regex/src/test/java/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java new file mode 100644 index 0000000..cbf6c64 --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/src/test/java/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java @@ -0,0 +1,186 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.regex; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileFilter; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.*; + +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +/** Unit tests for RegexUrlNormalizer. */ +public class TestRegexURLNormalizer { + private static final Logger LOG = LoggerFactory + .getLogger(TestRegexURLNormalizer.class); + + private RegexURLNormalizer normalizer; + private Configuration conf; + private Map<String, NormalizedURL[]> testData = new HashMap<String, NormalizedURL[]>(); + + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/urlnormalizer-regex/build.xml during plugin compilation. + + public TestRegexURLNormalizer() throws IOException { + normalizer = new RegexURLNormalizer(); + conf = NutchConfiguration.create(); + normalizer.setConf(conf); + File[] configs = new File(sampleDir).listFiles(new FileFilter() { + public boolean accept(File f) { + if (f.getName().endsWith(".xml") + && f.getName().startsWith("regex-normalize-")) + return true; + return false; + } + }); + for (int i = 0; i < configs.length; i++) { + try { + FileReader reader = new FileReader(configs[i]); + String cname = configs[i].getName(); + cname = cname.substring(16, cname.indexOf(".xml")); + normalizer.setConfiguration(reader, cname); + NormalizedURL[] urls = readTestFile(cname); + testData.put(cname, urls); + } catch (Exception e) { + LOG.warn("Could load config from '" + configs[i] + "': " + e.toString()); + } + } + } + + @Test + public void testNormalizerDefault() throws Exception { + normalizeTest((NormalizedURL[]) testData.get(URLNormalizers.SCOPE_DEFAULT), + URLNormalizers.SCOPE_DEFAULT); + } + + @Test + public void testNormalizerScope() throws Exception { + Iterator<String> it = testData.keySet().iterator(); + while (it.hasNext()) { + String scope = it.next(); + normalizeTest((NormalizedURL[]) testData.get(scope), scope); + } + } + + private void normalizeTest(NormalizedURL[] urls, String scope) + throws Exception { + for (int i = 0; i < urls.length; i++) { + String url = urls[i].url; + String normalized = normalizer.normalize(urls[i].url, scope); + String expected = urls[i].expectedURL; + LOG.info("scope: " + scope + " url: " + url + " | normalized: " + + normalized + " | expected: " + expected); + Assert.assertEquals(urls[i].expectedURL, normalized); + } + } + + private void bench(int loops, String scope) { + long start = System.currentTimeMillis(); + try { + NormalizedURL[] expected = (NormalizedURL[]) testData.get(scope); + if (expected == null) + return; + for (int i = 0; i < loops; i++) { + normalizeTest(expected, scope); + } + } catch (Exception e) { + Assert.fail(e.toString()); + } + LOG.info("bench time (" + loops + ") " + + (System.currentTimeMillis() - start) + "ms"); + } + + private static class NormalizedURL { + String url; + String expectedURL; + + public NormalizedURL(String line) { + String[] fields = line.split("\\s+"); + url = fields[0]; + expectedURL = fields[1]; + } + } + + private NormalizedURL[] readTestFile(String scope) throws IOException { + File f = new File(sampleDir, "regex-normalize-" + scope + ".test"); + @SuppressWarnings("resource") + BufferedReader in = new BufferedReader(new InputStreamReader( + new FileInputStream(f), "UTF-8")); + List<NormalizedURL> list = new ArrayList<NormalizedURL>(); + String line; + while ((line = in.readLine()) != null) { + if (line.trim().length() == 0 || line.startsWith("#") + || line.startsWith(" ")) + continue; + list.add(new NormalizedURL(line)); + } + return (NormalizedURL[]) list.toArray(new NormalizedURL[list.size()]); + } + + public static void main(String[] args) throws Exception { + if (args.length == 0) { + System.err.println("TestRegexURLNormalizer [-bench <iter>] <scope>"); + System.exit(-1); + } + boolean bench = false; + int iter = -1; + String scope = null; + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-bench")) { + bench = true; + iter = Integer.parseInt(args[++i]); + } else + scope = args[i]; + } + if (scope == null) { + System.err.println("Missing required scope name."); + System.exit(-1); + } + if (bench && iter < 0) { + System.err.println("Invalid number of iterations: " + iter); + System.exit(-1); + } + TestRegexURLNormalizer test = new TestRegexURLNormalizer(); + NormalizedURL[] urls = (NormalizedURL[]) test.testData.get(scope); + if (urls == null) { + LOG.warn("Missing test data for scope '" + scope + + "', using default scope."); + scope = URLNormalizers.SCOPE_DEFAULT; + urls = (NormalizedURL[]) test.testData.get(scope); + } + if (bench) { + test.bench(iter, scope); + } else { + test.normalizeTest(urls, scope); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java b/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java deleted file mode 100644 index cbf6c64..0000000 --- a/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java +++ /dev/null @@ -1,186 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.net.urlnormalizer.regex; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileFilter; -import java.io.FileInputStream; -import java.io.FileReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.*; - -import org.junit.Assert; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.net.URLNormalizers; -import org.apache.nutch.util.NutchConfiguration; - -/** Unit tests for RegexUrlNormalizer. */ -public class TestRegexURLNormalizer { - private static final Logger LOG = LoggerFactory - .getLogger(TestRegexURLNormalizer.class); - - private RegexURLNormalizer normalizer; - private Configuration conf; - private Map<String, NormalizedURL[]> testData = new HashMap<String, NormalizedURL[]>(); - - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/urlnormalizer-regex/build.xml during plugin compilation. - - public TestRegexURLNormalizer() throws IOException { - normalizer = new RegexURLNormalizer(); - conf = NutchConfiguration.create(); - normalizer.setConf(conf); - File[] configs = new File(sampleDir).listFiles(new FileFilter() { - public boolean accept(File f) { - if (f.getName().endsWith(".xml") - && f.getName().startsWith("regex-normalize-")) - return true; - return false; - } - }); - for (int i = 0; i < configs.length; i++) { - try { - FileReader reader = new FileReader(configs[i]); - String cname = configs[i].getName(); - cname = cname.substring(16, cname.indexOf(".xml")); - normalizer.setConfiguration(reader, cname); - NormalizedURL[] urls = readTestFile(cname); - testData.put(cname, urls); - } catch (Exception e) { - LOG.warn("Could load config from '" + configs[i] + "': " + e.toString()); - } - } - } - - @Test - public void testNormalizerDefault() throws Exception { - normalizeTest((NormalizedURL[]) testData.get(URLNormalizers.SCOPE_DEFAULT), - URLNormalizers.SCOPE_DEFAULT); - } - - @Test - public void testNormalizerScope() throws Exception { - Iterator<String> it = testData.keySet().iterator(); - while (it.hasNext()) { - String scope = it.next(); - normalizeTest((NormalizedURL[]) testData.get(scope), scope); - } - } - - private void normalizeTest(NormalizedURL[] urls, String scope) - throws Exception { - for (int i = 0; i < urls.length; i++) { - String url = urls[i].url; - String normalized = normalizer.normalize(urls[i].url, scope); - String expected = urls[i].expectedURL; - LOG.info("scope: " + scope + " url: " + url + " | normalized: " - + normalized + " | expected: " + expected); - Assert.assertEquals(urls[i].expectedURL, normalized); - } - } - - private void bench(int loops, String scope) { - long start = System.currentTimeMillis(); - try { - NormalizedURL[] expected = (NormalizedURL[]) testData.get(scope); - if (expected == null) - return; - for (int i = 0; i < loops; i++) { - normalizeTest(expected, scope); - } - } catch (Exception e) { - Assert.fail(e.toString()); - } - LOG.info("bench time (" + loops + ") " - + (System.currentTimeMillis() - start) + "ms"); - } - - private static class NormalizedURL { - String url; - String expectedURL; - - public NormalizedURL(String line) { - String[] fields = line.split("\\s+"); - url = fields[0]; - expectedURL = fields[1]; - } - } - - private NormalizedURL[] readTestFile(String scope) throws IOException { - File f = new File(sampleDir, "regex-normalize-" + scope + ".test"); - @SuppressWarnings("resource") - BufferedReader in = new BufferedReader(new InputStreamReader( - new FileInputStream(f), "UTF-8")); - List<NormalizedURL> list = new ArrayList<NormalizedURL>(); - String line; - while ((line = in.readLine()) != null) { - if (line.trim().length() == 0 || line.startsWith("#") - || line.startsWith(" ")) - continue; - list.add(new NormalizedURL(line)); - } - return (NormalizedURL[]) list.toArray(new NormalizedURL[list.size()]); - } - - public static void main(String[] args) throws Exception { - if (args.length == 0) { - System.err.println("TestRegexURLNormalizer [-bench <iter>] <scope>"); - System.exit(-1); - } - boolean bench = false; - int iter = -1; - String scope = null; - for (int i = 0; i < args.length; i++) { - if (args[i].equals("-bench")) { - bench = true; - iter = Integer.parseInt(args[++i]); - } else - scope = args[i]; - } - if (scope == null) { - System.err.println("Missing required scope name."); - System.exit(-1); - } - if (bench && iter < 0) { - System.err.println("Invalid number of iterations: " + iter); - System.exit(-1); - } - TestRegexURLNormalizer test = new TestRegexURLNormalizer(); - NormalizedURL[] urls = (NormalizedURL[]) test.testData.get(scope); - if (urls == null) { - LOG.warn("Missing test data for scope '" + scope - + "', using default scope."); - scope = URLNormalizers.SCOPE_DEFAULT; - urls = (NormalizedURL[]) test.testData.get(scope); - } - if (bench) { - test.bench(iter, scope); - } else { - test.normalizeTest(urls, scope); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-slash/src/test/java/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-slash/src/test/java/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java b/nutch-plugins/urlnormalizer-slash/src/test/java/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java new file mode 100644 index 0000000..c3585e4 --- /dev/null +++ b/nutch-plugins/urlnormalizer-slash/src/test/java/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.slash; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestSlashURLNormalizer extends TestCase { + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); + + public void testSlashURLNormalizer() throws Exception { + Configuration conf = NutchConfiguration.create(); + + String slashesFile = SAMPLES + SEPARATOR + "slashes.txt"; + SlashURLNormalizer normalizer = new SlashURLNormalizer(slashesFile); + normalizer.setConf(conf); + + // No change + assertEquals("http://example.org/", normalizer.normalize("http://example.org/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.net/", normalizer.normalize("http://example.net/", URLNormalizers.SCOPE_DEFAULT)); + + // Don't touch base URL's + assertEquals("http://example.org", normalizer.normalize("http://example.org", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.net", normalizer.normalize("http://example.net", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.org/", normalizer.normalize("http://example.org/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.net/", normalizer.normalize("http://example.net/", URLNormalizers.SCOPE_DEFAULT)); + + // Change + assertEquals("http://www.example.org/page/", normalizer.normalize("http://www.example.org/page", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://www.example.net/path/to/something", normalizer.normalize("http://www.example.net/path/to/something/", URLNormalizers.SCOPE_DEFAULT)); + + // No change + assertEquals("http://example.org/buh/", normalizer.normalize("http://example.org/buh/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.net/blaat", normalizer.normalize("http://example.net/blaat", URLNormalizers.SCOPE_DEFAULT)); + + // No change + assertEquals("http://example.nl/buh/", normalizer.normalize("http://example.nl/buh/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.de/blaat", normalizer.normalize("http://example.de/blaat", URLNormalizers.SCOPE_DEFAULT)); + + // Change + assertEquals("http://www.example.org/page/?a=b&c=d", normalizer.normalize("http://www.example.org/page?a=b&c=d", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://www.example.net/path/to/something?a=b&c=d", normalizer.normalize("http://www.example.net/path/to/something/?a=b&c=d", URLNormalizers.SCOPE_DEFAULT)); + + // No change + assertEquals("http://www.example.org/noise.mp3", normalizer.normalize("http://www.example.org/noise.mp3", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://www.example.org/page.html", normalizer.normalize("http://www.example.org/page.html", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://www.example.org/page.shtml", normalizer.normalize("http://www.example.org/page.shtml", URLNormalizers.SCOPE_DEFAULT)); + + // Change + assertEquals("http://www.example.org/this.is.not.an_extension/", normalizer.normalize("http://www.example.org/this.is.not.an_extension", URLNormalizers.SCOPE_DEFAULT)); + } +}
