http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java deleted file mode 100644 index 4224f93..0000000 --- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java +++ /dev/null @@ -1,156 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.tika; - -import org.apache.nutch.parse.HTMLMetaTags; -import org.apache.nutch.parse.tika.HTMLMetaProcessor; - -import java.io.ByteArrayInputStream; -import java.net.URL; - -import org.xml.sax.*; -import org.w3c.dom.*; -import org.apache.html.dom.*; -import org.cyberneko.html.parsers.DOMFragmentParser; -import org.junit.Assert; -import org.junit.Test; - -/** Unit tests for HTMLMetaProcessor. */ -public class TestRobotsMetaProcessor { - - /* - * - * some sample tags: - * - * <meta name="robots" content="index,follow"> <meta name="robots" - * content="noindex,follow"> <meta name="robots" content="index,nofollow"> - * <meta name="robots" content="noindex,nofollow"> - * - * <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> - */ - - public static String[] tests = { - "<html><head><title>test page</title>" - + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> " - + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"all\"> " - + "<meta http-equiv=\"pragma\" content=\"no-cache\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " - + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"none\"> " + "</head><body>" - + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"noindex,nofollow\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"noindex,follow\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"index,nofollow\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"index,follow\"> " - + "<base href=\"http://www.nutch.org/\">" + "</head><body>" - + " some text" + "</body></html>", - - "<html><head><title>test page</title>" + "<meta name=\"robots\"> " - + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>" - + " some text" + "</body></html>", - - }; - - public static final boolean[][] answers = { { true, true, true }, // NONE - { false, false, true }, // all - { true, true, true }, // nOnE - { true, true, false }, // none - { true, true, false }, // noindex,nofollow - { true, false, false }, // noindex,follow - { false, true, false }, // index,nofollow - { false, false, false }, // index,follow - { false, false, false }, // missing! - }; - - private URL[][] currURLsAndAnswers; - - @Test - public void testRobotsMetaProcessor() { - DOMFragmentParser parser = new DOMFragmentParser(); - ; - - try { - currURLsAndAnswers = new URL[][] { - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org/foo/"), - new URL("http://www.nutch.org/") }, - { new URL("http://www.nutch.org"), - new URL("http://www.nutch.org/base/") } }; - } catch (Exception e) { - Assert.assertTrue("couldn't make test URLs!", false); - } - - for (int i = 0; i < tests.length; i++) { - byte[] bytes = tests[i].getBytes(); - - DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); - - try { - parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node); - } catch (Exception e) { - e.printStackTrace(); - } - - HTMLMetaTags robotsMeta = new HTMLMetaTags(); - HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]); - - Assert.assertTrue("got index wrong on test " + i, - robotsMeta.getNoIndex() == answers[i][0]); - Assert.assertTrue("got follow wrong on test " + i, - robotsMeta.getNoFollow() == answers[i][1]); - Assert.assertTrue("got cache wrong on test " + i, - robotsMeta.getNoCache() == answers[i][2]); - Assert - .assertTrue( - "got base href wrong on test " + i + " (got " - + robotsMeta.getBaseHref() + ")", - ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) - || ((robotsMeta.getBaseHref() != null) && robotsMeta - .getBaseHref().equals(currURLsAndAnswers[i][1]))); - - } - } - -}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-zip/src/test/java/org/apache/nutch/parse/zip/TestZipParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-zip/src/test/java/org/apache/nutch/parse/zip/TestZipParser.java b/nutch-plugins/parse-zip/src/test/java/org/apache/nutch/parse/zip/TestZipParser.java new file mode 100644 index 0000000..17e386a --- /dev/null +++ b/nutch-plugins/parse-zip/src/test/java/org/apache/nutch/parse/zip/TestZipParser.java @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.zip; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.junit.Assert; +import org.junit.Test; + +/** + * Based on Unit tests for MSWordParser by John Xing + * + * @author Rohit Kulkarni & Ashish Vaidya + */ +public class TestZipParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + + // Make sure sample files are copied to "test.data" + + private String[] sampleFiles = { "test.zip" }; + + private String expectedText = "textfile.txt This is text file number 1 "; + + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parse parse; + + Configuration conf = NutchConfiguration.create(); + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content).get( + content.getUrl()); + Assert.assertTrue(parse.getText().equals(expectedText)); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java b/nutch-plugins/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java deleted file mode 100644 index 17e386a..0000000 --- a/nutch-plugins/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java +++ /dev/null @@ -1,71 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.zip; - -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.junit.Assert; -import org.junit.Test; - -/** - * Based on Unit tests for MSWordParser by John Xing - * - * @author Rohit Kulkarni & Ashish Vaidya - */ -public class TestZipParser { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - - // Make sure sample files are copied to "test.data" - - private String[] sampleFiles = { "test.zip" }; - - private String expectedText = "textfile.txt This is text file number 1 "; - - @Test - public void testIt() throws ProtocolException, ParseException { - String urlString; - Protocol protocol; - Content content; - Parse parse; - - Configuration conf = NutchConfiguration.create(); - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content).get( - content.getUrl()); - Assert.assertTrue(parse.getText().equals(expectedText)); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java b/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java new file mode 100644 index 0000000..9bd7149 --- /dev/null +++ b/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parsefilter.regex; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import junit.framework.TestCase; + +public class TestRegexParseFilter extends TestCase { + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); + + public void testPositiveFilter() throws Exception { + Configuration conf = NutchConfiguration.create(); + + String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt"; + RegexParseFilter filter = new RegexParseFilter(file); + filter.setConf(conf); + + String url = "http://nutch.apache.org/"; + String html = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>"; + Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf); + Parse parse = new ParseImpl("nutch this is the extracted text blablabla", new ParseData()); + + ParseResult result = ParseResult.createParseResult(url, parse); + result = filter.filter(content, result, null, null); + + Metadata meta = parse.getData().getParseMeta(); + + assertEquals("true", meta.get("first")); + assertEquals("true", meta.get("second")); + } + + public void testNegativeFilter() throws Exception { + Configuration conf = NutchConfiguration.create(); + + String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt"; + RegexParseFilter filter = new RegexParseFilter(file); + filter.setConf(conf); + + String url = "http://nutch.apache.org/"; + String html = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>"; + Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf); + Parse parse = new ParseImpl("nutch this is the extracted text bla", new ParseData()); + + ParseResult result = ParseResult.createParseResult(url, parse); + result = filter.filter(content, result, null, null); + + Metadata meta = parse.getData().getParseMeta(); + + assertEquals("false", meta.get("first")); + assertEquals("false", meta.get("second")); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java b/nutch-plugins/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java deleted file mode 100644 index 9bd7149..0000000 --- a/nutch-plugins/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.parsefilter.regex; - -import java.net.MalformedURLException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.util.NutchConfiguration; -import junit.framework.TestCase; - -public class TestRegexParseFilter extends TestCase { - - private final static String SEPARATOR = System.getProperty("file.separator"); - private final static String SAMPLES = System.getProperty("test.data", "."); - - public void testPositiveFilter() throws Exception { - Configuration conf = NutchConfiguration.create(); - - String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt"; - RegexParseFilter filter = new RegexParseFilter(file); - filter.setConf(conf); - - String url = "http://nutch.apache.org/"; - String html = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>"; - Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf); - Parse parse = new ParseImpl("nutch this is the extracted text blablabla", new ParseData()); - - ParseResult result = ParseResult.createParseResult(url, parse); - result = filter.filter(content, result, null, null); - - Metadata meta = parse.getData().getParseMeta(); - - assertEquals("true", meta.get("first")); - assertEquals("true", meta.get("second")); - } - - public void testNegativeFilter() throws Exception { - Configuration conf = NutchConfiguration.create(); - - String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt"; - RegexParseFilter filter = new RegexParseFilter(file); - filter.setConf(conf); - - String url = "http://nutch.apache.org/"; - String html = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>"; - Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf); - Parse parse = new ParseImpl("nutch this is the extracted text bla", new ParseData()); - - ParseResult result = ParseResult.createParseResult(url, parse); - result = filter.filter(content, result, null, null); - - Metadata meta = parse.getData().getParseMeta(); - - assertEquals("false", meta.get("first")); - assertEquals("false", meta.get("second")); - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java b/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java new file mode 100644 index 0000000..5f95377 --- /dev/null +++ b/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.file; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; + +// Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * @author mattmann + * @version $Revision$ + * + * <p> + * Unit tests for the {@link File}Protocol. + * </p> + * . + */ +public class TestProtocolFile { + + private String fileSeparator = System.getProperty("file.separator"); + private String sampleDir = System.getProperty("test.data", "."); + + private static final String[] testTextFiles = new String[] { + "testprotocolfile.txt", "testprotocolfile_(encoded).txt", + "testprotocolfile_%28encoded%29.txt" }; + + private static final CrawlDatum datum = new CrawlDatum(); + + private static final String expectedMimeType = "text/plain"; + + private Configuration conf; + + @Before + public void setUp() { + conf = NutchConfiguration.create(); + } + + @Test + public void testSetContentType() throws ProtocolException { + for (String testTextFile : testTextFiles) { + setContentType(testTextFile); + } + } + + /** + * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata field. + * + * @since NUTCH-384 + * + */ + public void setContentType(String testTextFile) throws ProtocolException { + String urlString = "file:" + sampleDir + fileSeparator + testTextFile; + Assert.assertNotNull(urlString); + Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); + ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString), + datum); + Assert.assertNotNull(output); + Assert.assertEquals("Status code: [" + output.getStatus().getCode() + + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: [" + + output.getStatus().getArgs() + "]", ProtocolStatus.SUCCESS, output + .getStatus().getCode()); + Assert.assertNotNull(output.getContent()); + Assert.assertNotNull(output.getContent().getContentType()); + Assert.assertEquals(expectedMimeType, output.getContent().getContentType()); + Assert.assertNotNull(output.getContent().getMetadata()); + Assert.assertEquals(expectedMimeType, output.getContent().getMetadata() + .get(Response.CONTENT_TYPE)); + + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java b/nutch-plugins/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java deleted file mode 100644 index 5f95377..0000000 --- a/nutch-plugins/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.protocol.file; - -// Hadoop imports -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; - -// Nutch imports -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.ProtocolOutput; -import org.apache.nutch.protocol.ProtocolStatus; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -/** - * @author mattmann - * @version $Revision$ - * - * <p> - * Unit tests for the {@link File}Protocol. - * </p> - * . - */ -public class TestProtocolFile { - - private String fileSeparator = System.getProperty("file.separator"); - private String sampleDir = System.getProperty("test.data", "."); - - private static final String[] testTextFiles = new String[] { - "testprotocolfile.txt", "testprotocolfile_(encoded).txt", - "testprotocolfile_%28encoded%29.txt" }; - - private static final CrawlDatum datum = new CrawlDatum(); - - private static final String expectedMimeType = "text/plain"; - - private Configuration conf; - - @Before - public void setUp() { - conf = NutchConfiguration.create(); - } - - @Test - public void testSetContentType() throws ProtocolException { - for (String testTextFile : testTextFiles) { - setContentType(testTextFile); - } - } - - /** - * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata field. - * - * @since NUTCH-384 - * - */ - public void setContentType(String testTextFile) throws ProtocolException { - String urlString = "file:" + sampleDir + fileSeparator + testTextFile; - Assert.assertNotNull(urlString); - Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString), - datum); - Assert.assertNotNull(output); - Assert.assertEquals("Status code: [" + output.getStatus().getCode() - + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: [" - + output.getStatus().getArgs() + "]", ProtocolStatus.SUCCESS, output - .getStatus().getCode()); - Assert.assertNotNull(output.getContent()); - Assert.assertNotNull(output.getContent().getContentType()); - Assert.assertEquals(expectedMimeType, output.getContent().getContentType()); - Assert.assertNotNull(output.getContent().getMetadata()); - Assert.assertEquals(expectedMimeType, output.getContent().getMetadata() - .get(Response.CONTENT_TYPE)); - - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java b/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java new file mode 100644 index 0000000..7dd9e9b --- /dev/null +++ b/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.http; + +import static org.junit.Assert.assertEquals; + +import java.net.URL; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolOutput; +import org.junit.After; +import org.junit.Test; +import org.mortbay.jetty.Server; +import org.mortbay.jetty.nio.SelectChannelConnector; +import org.mortbay.jetty.servlet.Context; +import org.mortbay.jetty.servlet.ServletHolder; + +/** + * Test cases for protocol-http + */ +public class TestProtocolHttp { + private static final String RES_DIR = System.getProperty("test.data", "."); + + private Http http; + private Server server; + private Context root; + private Configuration conf; + private int port; + + public void setUp(boolean redirection) throws Exception { + conf = new Configuration(); + conf.addResource("nutch-default.xml"); + conf.addResource("nutch-site-test.xml"); + + http = new Http(); + http.setConf(conf); + + server = new Server(); + + if (redirection) { + root = new Context(server, "/redirection", Context.SESSIONS); + root.setAttribute("newContextURL", "/redirect"); + } else { + root = new Context(server, "/", Context.SESSIONS); + } + + ServletHolder sh = new ServletHolder( + org.apache.jasper.servlet.JspServlet.class); + root.addServlet(sh, "*.jsp"); + root.setResourceBase(RES_DIR); + } + + @After + public void tearDown() throws Exception { + server.stop(); + } + + @Test + public void testStatusCode() throws Exception { + startServer(47504, false); + fetchPage("/basic-http.jsp", 200); + fetchPage("/redirect301.jsp", 301); + fetchPage("/redirect302.jsp", 302); + fetchPage("/nonexists.html", 404); + fetchPage("/brokenpage.jsp", 500); + } + + @Test + public void testRedirectionJetty() throws Exception { + // Redirection via Jetty + startServer(47503, true); + fetchPage("/redirection", 302); + } + + /** + * Starts the Jetty server at a specified port and redirection parameter. + * + * @param portno + * Port number. + * @param redirection + * whether redirection + */ + private void startServer(int portno, boolean redirection) throws Exception { + port = portno; + setUp(redirection); + SelectChannelConnector connector = new SelectChannelConnector(); + connector.setHost("127.0.0.1"); + connector.setPort(port); + + server.addConnector(connector); + server.start(); + } + + /** + * Fetches the specified <code>page</code> from the local Jetty server and + * checks whether the HTTP response status code matches with the expected + * code. Also use jsp pages for redirection. + * + * @param page + * Page to be fetched. + * @param expectedCode + * HTTP response status code expected while fetching the page. + */ + private void fetchPage(String page, int expectedCode) throws Exception { + URL url = new URL("http", "127.0.0.1", port, page); + CrawlDatum crawlDatum = new CrawlDatum(); + Response response = http.getResponse(url, crawlDatum, true); + ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()), + crawlDatum); + Content content = out.getContent(); + assertEquals("HTTP Status Code for " + url, expectedCode, + response.getCode()); + + if (page.compareTo("/nonexists.html") != 0 + && page.compareTo("/brokenpage.jsp") != 0 + && page.compareTo("/redirection") != 0) { + assertEquals("ContentType " + url, "text/html", + content.getContentType()); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java b/nutch-plugins/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java deleted file mode 100644 index 7dd9e9b..0000000 --- a/nutch-plugins/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.protocol.http; - -import static org.junit.Assert.assertEquals; - -import java.net.URL; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolOutput; -import org.junit.After; -import org.junit.Test; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.nio.SelectChannelConnector; -import org.mortbay.jetty.servlet.Context; -import org.mortbay.jetty.servlet.ServletHolder; - -/** - * Test cases for protocol-http - */ -public class TestProtocolHttp { - private static final String RES_DIR = System.getProperty("test.data", "."); - - private Http http; - private Server server; - private Context root; - private Configuration conf; - private int port; - - public void setUp(boolean redirection) throws Exception { - conf = new Configuration(); - conf.addResource("nutch-default.xml"); - conf.addResource("nutch-site-test.xml"); - - http = new Http(); - http.setConf(conf); - - server = new Server(); - - if (redirection) { - root = new Context(server, "/redirection", Context.SESSIONS); - root.setAttribute("newContextURL", "/redirect"); - } else { - root = new Context(server, "/", Context.SESSIONS); - } - - ServletHolder sh = new ServletHolder( - org.apache.jasper.servlet.JspServlet.class); - root.addServlet(sh, "*.jsp"); - root.setResourceBase(RES_DIR); - } - - @After - public void tearDown() throws Exception { - server.stop(); - } - - @Test - public void testStatusCode() throws Exception { - startServer(47504, false); - fetchPage("/basic-http.jsp", 200); - fetchPage("/redirect301.jsp", 301); - fetchPage("/redirect302.jsp", 302); - fetchPage("/nonexists.html", 404); - fetchPage("/brokenpage.jsp", 500); - } - - @Test - public void testRedirectionJetty() throws Exception { - // Redirection via Jetty - startServer(47503, true); - fetchPage("/redirection", 302); - } - - /** - * Starts the Jetty server at a specified port and redirection parameter. - * - * @param portno - * Port number. - * @param redirection - * whether redirection - */ - private void startServer(int portno, boolean redirection) throws Exception { - port = portno; - setUp(redirection); - SelectChannelConnector connector = new SelectChannelConnector(); - connector.setHost("127.0.0.1"); - connector.setPort(port); - - server.addConnector(connector); - server.start(); - } - - /** - * Fetches the specified <code>page</code> from the local Jetty server and - * checks whether the HTTP response status code matches with the expected - * code. Also use jsp pages for redirection. - * - * @param page - * Page to be fetched. - * @param expectedCode - * HTTP response status code expected while fetching the page. - */ - private void fetchPage(String page, int expectedCode) throws Exception { - URL url = new URL("http", "127.0.0.1", port, page); - CrawlDatum crawlDatum = new CrawlDatum(); - Response response = http.getResponse(url, crawlDatum, true); - ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()), - crawlDatum); - Content content = out.getContent(); - assertEquals("HTTP Status Code for " + url, expectedCode, - response.getCode()); - - if (page.compareTo("/nonexists.html") != 0 - && page.compareTo("/brokenpage.jsp") != 0 - && page.compareTo("/redirection") != 0) { - assertEquals("ContentType " + url, "text/html", - content.getContentType()); - } - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/protocol-httpclient/src/test/java/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-httpclient/src/test/java/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java b/nutch-plugins/protocol-httpclient/src/test/java/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java new file mode 100644 index 0000000..783e5af --- /dev/null +++ b/nutch-plugins/protocol-httpclient/src/test/java/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.httpclient; + +import java.net.URL; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.mortbay.jetty.Server; +import org.mortbay.jetty.bio.SocketConnector; +import org.mortbay.jetty.handler.ContextHandler; +import org.mortbay.jetty.servlet.ServletHandler; +import org.mortbay.jetty.servlet.SessionHandler; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; + +/** + * Test cases for protocol-httpclient. + */ +public class TestProtocolHttpClient { + + private Server server; + private Configuration conf; + private static final String RES_DIR = System.getProperty("test.data", "."); + private int port; + private Http http = new Http(); + + @Before + public void setUp() throws Exception { + + ContextHandler context = new ContextHandler(); + context.setContextPath("/"); + context.setResourceBase(RES_DIR); + ServletHandler sh = new ServletHandler(); + sh.addServletWithMapping("org.apache.jasper.servlet.JspServlet", "*.jsp"); + context.addHandler(sh); + context.addHandler(new SessionHandler()); + + server = new Server(); + server.addHandler(context); + + conf = new Configuration(); + conf.addResource("nutch-default.xml"); + conf.addResource("nutch-site-test.xml"); + + http = new Http(); + http.setConf(conf); + } + + @After + public void tearDown() throws Exception { + server.stop(); + for (int i = 0; i < 5; i++) { + if (!server.isStopped()) { + Thread.sleep(1000); + } + } + } + + /** + * Tests whether the client can remember cookies. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testCookies() throws Exception { + startServer(47500); + fetchPage("/cookies.jsp", 200); + fetchPage("/cookies.jsp?cookie=yes", 200); + } + + /** + * Tests that no pre-emptive authorization headers are sent by the client. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testNoPreemptiveAuth() throws Exception { + startServer(47500); + fetchPage("/noauth.jsp", 200); + } + + /** + * Tests default credentials. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testDefaultCredentials() throws Exception { + startServer(47502); + fetchPage("/basic.jsp", 200); + } + + /** + * Tests basic authentication scheme for various realms. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testBasicAuth() throws Exception { + startServer(47500); + fetchPage("/basic.jsp", 200); + fetchPage("/basic.jsp?case=1", 200); + fetchPage("/basic.jsp?case=2", 200); + server.start(); + } + + /** + * Tests that authentication happens for a defined realm and not for other + * realms for a host:port when an extra <code>authscope</code> tag is not + * defined to match all other realms. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testOtherRealmsNoAuth() throws Exception { + startServer(47501); + fetchPage("/basic.jsp", 200); + fetchPage("/basic.jsp?case=1", 401); + fetchPage("/basic.jsp?case=2", 401); + } + + /** + * Tests Digest authentication scheme. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testDigestAuth() throws Exception { + startServer(47500); + fetchPage("/digest.jsp", 200); + } + + /** + * Tests NTLM authentication scheme. + * + * @throws Exception + * If an error occurs or the test case fails. + */ + @Test + public void testNtlmAuth() throws Exception { + startServer(47501); + fetchPage("/ntlm.jsp", 200); + } + + /** + * Starts the Jetty server at a specified port. + * + * Will try up to 10 ports to find an available port to use. + * + * @param portno + * Port number. + * @throws Exception + * When an error occurs. + */ + private void startServer(int portno) throws Exception { + SocketConnector listener = new SocketConnector(); + listener.setHost("127.0.0.1"); + server.addConnector(listener); + for (int p = portno; p < portno + 10; p++) { + port = portno; + listener.setPort(port); + try { + server.start(); + break; + } catch (Exception e) { + if (p == portno + 9) { + throw e; + } + } + } + } + + /** + * Fetches the specified <code>page</code> from the local Jetty server and + * checks whether the HTTP response status code matches with the expected + * code. + * + * @param page + * Page to be fetched. + * @param expectedCode + * HTTP response status code expected while fetching the page. + * @throws Exception + * When an error occurs or test case fails. + */ + private void fetchPage(String page, int expectedCode) throws Exception { + URL url = new URL("http", "127.0.0.1", port, page); + Response response = null; + response = http.getResponse(url, new CrawlDatum(), true); + + int code = response.getCode(); + Assert.assertEquals("HTTP Status Code for " + url, expectedCode, code); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java b/nutch-plugins/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java deleted file mode 100644 index 783e5af..0000000 --- a/nutch-plugins/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.protocol.httpclient; - -import java.net.URL; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.bio.SocketConnector; -import org.mortbay.jetty.handler.ContextHandler; -import org.mortbay.jetty.servlet.ServletHandler; -import org.mortbay.jetty.servlet.SessionHandler; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.net.protocols.Response; - -/** - * Test cases for protocol-httpclient. - */ -public class TestProtocolHttpClient { - - private Server server; - private Configuration conf; - private static final String RES_DIR = System.getProperty("test.data", "."); - private int port; - private Http http = new Http(); - - @Before - public void setUp() throws Exception { - - ContextHandler context = new ContextHandler(); - context.setContextPath("/"); - context.setResourceBase(RES_DIR); - ServletHandler sh = new ServletHandler(); - sh.addServletWithMapping("org.apache.jasper.servlet.JspServlet", "*.jsp"); - context.addHandler(sh); - context.addHandler(new SessionHandler()); - - server = new Server(); - server.addHandler(context); - - conf = new Configuration(); - conf.addResource("nutch-default.xml"); - conf.addResource("nutch-site-test.xml"); - - http = new Http(); - http.setConf(conf); - } - - @After - public void tearDown() throws Exception { - server.stop(); - for (int i = 0; i < 5; i++) { - if (!server.isStopped()) { - Thread.sleep(1000); - } - } - } - - /** - * Tests whether the client can remember cookies. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testCookies() throws Exception { - startServer(47500); - fetchPage("/cookies.jsp", 200); - fetchPage("/cookies.jsp?cookie=yes", 200); - } - - /** - * Tests that no pre-emptive authorization headers are sent by the client. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testNoPreemptiveAuth() throws Exception { - startServer(47500); - fetchPage("/noauth.jsp", 200); - } - - /** - * Tests default credentials. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testDefaultCredentials() throws Exception { - startServer(47502); - fetchPage("/basic.jsp", 200); - } - - /** - * Tests basic authentication scheme for various realms. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testBasicAuth() throws Exception { - startServer(47500); - fetchPage("/basic.jsp", 200); - fetchPage("/basic.jsp?case=1", 200); - fetchPage("/basic.jsp?case=2", 200); - server.start(); - } - - /** - * Tests that authentication happens for a defined realm and not for other - * realms for a host:port when an extra <code>authscope</code> tag is not - * defined to match all other realms. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testOtherRealmsNoAuth() throws Exception { - startServer(47501); - fetchPage("/basic.jsp", 200); - fetchPage("/basic.jsp?case=1", 401); - fetchPage("/basic.jsp?case=2", 401); - } - - /** - * Tests Digest authentication scheme. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testDigestAuth() throws Exception { - startServer(47500); - fetchPage("/digest.jsp", 200); - } - - /** - * Tests NTLM authentication scheme. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testNtlmAuth() throws Exception { - startServer(47501); - fetchPage("/ntlm.jsp", 200); - } - - /** - * Starts the Jetty server at a specified port. - * - * Will try up to 10 ports to find an available port to use. - * - * @param portno - * Port number. - * @throws Exception - * When an error occurs. - */ - private void startServer(int portno) throws Exception { - SocketConnector listener = new SocketConnector(); - listener.setHost("127.0.0.1"); - server.addConnector(listener); - for (int p = portno; p < portno + 10; p++) { - port = portno; - listener.setPort(port); - try { - server.start(); - break; - } catch (Exception e) { - if (p == portno + 9) { - throw e; - } - } - } - } - - /** - * Fetches the specified <code>page</code> from the local Jetty server and - * checks whether the HTTP response status code matches with the expected - * code. - * - * @param page - * Page to be fetched. - * @param expectedCode - * HTTP response status code expected while fetching the page. - * @throws Exception - * When an error occurs or test case fails. - */ - private void fetchPage(String page, int expectedCode) throws Exception { - URL url = new URL("http", "127.0.0.1", port, page); - Response response = null; - response = http.getResponse(url, new CrawlDatum(), true); - - int code = response.getCode(); - Assert.assertEquals("HTTP Status Code for " + url, expectedCode, code); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/subcollection/src/test/java/org/apache/nutch/collection/TestSubcollection.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/subcollection/src/test/java/org/apache/nutch/collection/TestSubcollection.java b/nutch-plugins/subcollection/src/test/java/org/apache/nutch/collection/TestSubcollection.java new file mode 100644 index 0000000..a2d2772 --- /dev/null +++ b/nutch-plugins/subcollection/src/test/java/org/apache/nutch/collection/TestSubcollection.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.collection; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.util.Collection; + +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +public class TestSubcollection { + + /** + * Test filtering logic + * + * @throws Exception + */ + @Test + public void testFilter() throws Exception { + Subcollection sc = new Subcollection(NutchConfiguration.create()); + sc.setWhiteList("www.nutch.org\nwww.apache.org"); + sc.setBlackList("jpg\nwww.apache.org/zecret/"); + + // matches whitelist + Assert.assertEquals("http://www.apache.org/index.html", + sc.filter("http://www.apache.org/index.html")); + + // matches blacklist + Assert.assertEquals(null, + sc.filter("http://www.apache.org/zecret/index.html")); + Assert.assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg")); + + // no match + Assert.assertEquals(null, sc.filter("http://www.google.com/")); + } + + @Test + public void testInput() { + StringBuffer xml = new StringBuffer(); + xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); + xml.append("<!-- just a comment -->"); + xml.append("<subcollections>"); + xml.append("<subcollection>"); + xml.append("<name>nutch collection</name>"); + xml.append("<id>nutch</id>"); + xml.append("<whitelist>"); + xml.append("http://lucene.apache.org/nutch/\n"); + xml.append("http://wiki.apache.org/nutch/\n"); + xml.append("</whitelist>"); + xml.append("<blacklist>"); + xml.append("http://www.xxx.yyy\n"); + xml.append("</blacklist>"); + xml.append("</subcollection>"); + xml.append("</subcollections>"); + + InputStream is = new ByteArrayInputStream(xml.toString().getBytes()); + + CollectionManager cm = new CollectionManager(); + cm.parse(is); + + Collection<?> c = cm.getAll(); + + // test that size matches + Assert.assertEquals(1, c.size()); + + Subcollection collection = (Subcollection) c.toArray()[0]; + + // test collection id + Assert.assertEquals("nutch", collection.getId()); + + // test collection name + Assert.assertEquals("nutch collection", collection.getName()); + + // test whitelist + Assert.assertEquals(2, collection.whiteList.size()); + + String wlUrl = (String) collection.whiteList.get(0); + Assert.assertEquals("http://lucene.apache.org/nutch/", wlUrl); + + wlUrl = (String) collection.whiteList.get(1); + Assert.assertEquals("http://wiki.apache.org/nutch/", wlUrl); + + // matches whitelist + Assert.assertEquals("http://lucene.apache.org/nutch/", + collection.filter("http://lucene.apache.org/nutch/")); + + // test blacklist + Assert.assertEquals(1, collection.blackList.size()); + + String blUrl = (String) collection.blackList.get(0); + Assert.assertEquals("http://www.xxx.yyy", blUrl); + + // no match + Assert.assertEquals(null, collection.filter("http://www.google.com/")); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java b/nutch-plugins/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java deleted file mode 100644 index a2d2772..0000000 --- a/nutch-plugins/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.collection; - -import java.io.ByteArrayInputStream; -import java.io.InputStream; -import java.util.Collection; - -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -public class TestSubcollection { - - /** - * Test filtering logic - * - * @throws Exception - */ - @Test - public void testFilter() throws Exception { - Subcollection sc = new Subcollection(NutchConfiguration.create()); - sc.setWhiteList("www.nutch.org\nwww.apache.org"); - sc.setBlackList("jpg\nwww.apache.org/zecret/"); - - // matches whitelist - Assert.assertEquals("http://www.apache.org/index.html", - sc.filter("http://www.apache.org/index.html")); - - // matches blacklist - Assert.assertEquals(null, - sc.filter("http://www.apache.org/zecret/index.html")); - Assert.assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg")); - - // no match - Assert.assertEquals(null, sc.filter("http://www.google.com/")); - } - - @Test - public void testInput() { - StringBuffer xml = new StringBuffer(); - xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); - xml.append("<!-- just a comment -->"); - xml.append("<subcollections>"); - xml.append("<subcollection>"); - xml.append("<name>nutch collection</name>"); - xml.append("<id>nutch</id>"); - xml.append("<whitelist>"); - xml.append("http://lucene.apache.org/nutch/\n"); - xml.append("http://wiki.apache.org/nutch/\n"); - xml.append("</whitelist>"); - xml.append("<blacklist>"); - xml.append("http://www.xxx.yyy\n"); - xml.append("</blacklist>"); - xml.append("</subcollection>"); - xml.append("</subcollections>"); - - InputStream is = new ByteArrayInputStream(xml.toString().getBytes()); - - CollectionManager cm = new CollectionManager(); - cm.parse(is); - - Collection<?> c = cm.getAll(); - - // test that size matches - Assert.assertEquals(1, c.size()); - - Subcollection collection = (Subcollection) c.toArray()[0]; - - // test collection id - Assert.assertEquals("nutch", collection.getId()); - - // test collection name - Assert.assertEquals("nutch collection", collection.getName()); - - // test whitelist - Assert.assertEquals(2, collection.whiteList.size()); - - String wlUrl = (String) collection.whiteList.get(0); - Assert.assertEquals("http://lucene.apache.org/nutch/", wlUrl); - - wlUrl = (String) collection.whiteList.get(1); - Assert.assertEquals("http://wiki.apache.org/nutch/", wlUrl); - - // matches whitelist - Assert.assertEquals("http://lucene.apache.org/nutch/", - collection.filter("http://lucene.apache.org/nutch/")); - - // test blacklist - Assert.assertEquals(1, collection.blackList.size()); - - String blUrl = (String) collection.blackList.get(0); - Assert.assertEquals("http://www.xxx.yyy", blUrl); - - // no match - Assert.assertEquals(null, collection.filter("http://www.google.com/")); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-automaton/src/test/java/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/java/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java b/nutch-plugins/urlfilter-automaton/src/test/java/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java new file mode 100644 index 0000000..a70a6b6 --- /dev/null +++ b/nutch-plugins/urlfilter-automaton/src/test/java/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java @@ -0,0 +1,56 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.automaton; + +// JDK imports +import java.io.IOException; +import java.io.Reader; + +import org.apache.nutch.net.*; +// Nutch imports +import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest; +import org.junit.Assert; +import org.junit.Test; + +/** + * JUnit based test of class <code>AutomatonURLFilter</code>. + * + * @author Jérôme Charron + */ +public class TestAutomatonURLFilter extends RegexURLFilterBaseTest { + + protected URLFilter getURLFilter(Reader rules) { + try { + return new AutomatonURLFilter(rules); + } catch (IOException e) { + Assert.fail(e.toString()); + return null; + } + } + + @Test + public void test() { + test("WholeWebCrawling"); + test("IntranetCrawling"); + bench(50, "Benchmarks"); + bench(100, "Benchmarks"); + bench(200, "Benchmarks"); + bench(400, "Benchmarks"); + bench(800, "Benchmarks"); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java b/nutch-plugins/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java deleted file mode 100644 index a70a6b6..0000000 --- a/nutch-plugins/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.urlfilter.automaton; - -// JDK imports -import java.io.IOException; -import java.io.Reader; - -import org.apache.nutch.net.*; -// Nutch imports -import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest; -import org.junit.Assert; -import org.junit.Test; - -/** - * JUnit based test of class <code>AutomatonURLFilter</code>. - * - * @author Jérôme Charron - */ -public class TestAutomatonURLFilter extends RegexURLFilterBaseTest { - - protected URLFilter getURLFilter(Reader rules) { - try { - return new AutomatonURLFilter(rules); - } catch (IOException e) { - Assert.fail(e.toString()); - return null; - } - } - - @Test - public void test() { - test("WholeWebCrawling"); - test("IntranetCrawling"); - bench(50, "Benchmarks"); - bench(100, "Benchmarks"); - bench(200, "Benchmarks"); - bench(400, "Benchmarks"); - bench(800, "Benchmarks"); - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java b/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java new file mode 100644 index 0000000..0be1e31 --- /dev/null +++ b/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.domain; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +public class TestDomainURLFilter { + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); + + @Test + public void testFilter() throws Exception { + + String domainFile = SAMPLES + SEPARATOR + "hosts.txt"; + Configuration conf = NutchConfiguration.create(); + DomainURLFilter domainFilter = new DomainURLFilter(domainFile); + domainFilter.setConf(conf); + Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org")); + Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org")); + Assert.assertNotNull(domainFilter.filter("http://www.apache.org")); + Assert.assertNull(domainFilter.filter("http://www.google.com")); + Assert.assertNull(domainFilter.filter("http://mail.yahoo.com")); + Assert.assertNotNull(domainFilter.filter("http://www.foobar.net")); + Assert.assertNotNull(domainFilter.filter("http://www.foobas.net")); + Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com")); + Assert.assertNotNull(domainFilter.filter("http://www.foobar.be")); + Assert.assertNull(domainFilter.filter("http://www.adobe.com")); + } + + @Test + public void testNoFilter() throws Exception { + // https://issues.apache.org/jira/browse/NUTCH-2189 + String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt"; + Configuration conf = NutchConfiguration.create(); + DomainURLFilter domainFilter = new DomainURLFilter(domainFile); + domainFilter.setConf(conf); + Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org")); + Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org")); + Assert.assertNotNull(domainFilter.filter("http://www.apache.org")); + Assert.assertNotNull(domainFilter.filter("http://www.google.com")); + Assert.assertNotNull(domainFilter.filter("http://mail.yahoo.com")); + Assert.assertNotNull(domainFilter.filter("http://www.foobar.net")); + Assert.assertNotNull(domainFilter.filter("http://www.foobas.net")); + Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com")); + Assert.assertNotNull(domainFilter.filter("http://www.foobar.be")); + Assert.assertNotNull(domainFilter.filter("http://www.adobe.com")); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java b/nutch-plugins/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java deleted file mode 100644 index 0be1e31..0000000 --- a/nutch-plugins/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.urlfilter.domain; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -public class TestDomainURLFilter { - - private final static String SEPARATOR = System.getProperty("file.separator"); - private final static String SAMPLES = System.getProperty("test.data", "."); - - @Test - public void testFilter() throws Exception { - - String domainFile = SAMPLES + SEPARATOR + "hosts.txt"; - Configuration conf = NutchConfiguration.create(); - DomainURLFilter domainFilter = new DomainURLFilter(domainFile); - domainFilter.setConf(conf); - Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org")); - Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org")); - Assert.assertNotNull(domainFilter.filter("http://www.apache.org")); - Assert.assertNull(domainFilter.filter("http://www.google.com")); - Assert.assertNull(domainFilter.filter("http://mail.yahoo.com")); - Assert.assertNotNull(domainFilter.filter("http://www.foobar.net")); - Assert.assertNotNull(domainFilter.filter("http://www.foobas.net")); - Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com")); - Assert.assertNotNull(domainFilter.filter("http://www.foobar.be")); - Assert.assertNull(domainFilter.filter("http://www.adobe.com")); - } - - @Test - public void testNoFilter() throws Exception { - // https://issues.apache.org/jira/browse/NUTCH-2189 - String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt"; - Configuration conf = NutchConfiguration.create(); - DomainURLFilter domainFilter = new DomainURLFilter(domainFile); - domainFilter.setConf(conf); - Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org")); - Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org")); - Assert.assertNotNull(domainFilter.filter("http://www.apache.org")); - Assert.assertNotNull(domainFilter.filter("http://www.google.com")); - Assert.assertNotNull(domainFilter.filter("http://mail.yahoo.com")); - Assert.assertNotNull(domainFilter.filter("http://www.foobar.net")); - Assert.assertNotNull(domainFilter.filter("http://www.foobas.net")); - Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com")); - Assert.assertNotNull(domainFilter.filter("http://www.foobar.be")); - Assert.assertNotNull(domainFilter.filter("http://www.adobe.com")); - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java b/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java new file mode 100644 index 0000000..d253867 --- /dev/null +++ b/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.domainblacklist; + +import org.junit.Assert; +import org.junit.Test; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +public class TestDomainBlacklistURLFilter { + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); + + @Test + public void testFilter() throws Exception { + + String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt"; + Configuration conf = NutchConfiguration.create(); + DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter( + domainBlacklistFile); + domainBlacklistFilter.setConf(conf); + Assert.assertNull(domainBlacklistFilter.filter("http://lucene.apache.org")); + Assert.assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org")); + Assert.assertNull(domainBlacklistFilter.filter("http://www.apache.org")); + Assert.assertNotNull(domainBlacklistFilter.filter("http://www.google.com")); + Assert.assertNotNull(domainBlacklistFilter.filter("http://mail.yahoo.com")); + Assert.assertNull(domainBlacklistFilter.filter("http://www.foobar.net")); + Assert.assertNull(domainBlacklistFilter.filter("http://www.foobas.net")); + Assert.assertNull(domainBlacklistFilter.filter("http://www.yahoo.com")); + Assert.assertNull(domainBlacklistFilter.filter("http://www.foobar.be")); + Assert.assertNotNull(domainBlacklistFilter.filter("http://www.adobe.com")); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java b/nutch-plugins/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java deleted file mode 100644 index d253867..0000000 --- a/nutch-plugins/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.urlfilter.domainblacklist; - -import org.junit.Assert; -import org.junit.Test; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; - -public class TestDomainBlacklistURLFilter { - - private final static String SEPARATOR = System.getProperty("file.separator"); - private final static String SAMPLES = System.getProperty("test.data", "."); - - @Test - public void testFilter() throws Exception { - - String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt"; - Configuration conf = NutchConfiguration.create(); - DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter( - domainBlacklistFile); - domainBlacklistFilter.setConf(conf); - Assert.assertNull(domainBlacklistFilter.filter("http://lucene.apache.org")); - Assert.assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org")); - Assert.assertNull(domainBlacklistFilter.filter("http://www.apache.org")); - Assert.assertNotNull(domainBlacklistFilter.filter("http://www.google.com")); - Assert.assertNotNull(domainBlacklistFilter.filter("http://mail.yahoo.com")); - Assert.assertNull(domainBlacklistFilter.filter("http://www.foobar.net")); - Assert.assertNull(domainBlacklistFilter.filter("http://www.foobas.net")); - Assert.assertNull(domainBlacklistFilter.filter("http://www.yahoo.com")); - Assert.assertNull(domainBlacklistFilter.filter("http://www.foobar.be")); - Assert.assertNotNull(domainBlacklistFilter.filter("http://www.adobe.com")); - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java b/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java new file mode 100644 index 0000000..b7a7ce4 --- /dev/null +++ b/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.prefix; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + +import java.io.IOException; + + +/** + * JUnit test for <code>PrefixURLFilter</code>. + * + * @author Talat Uyarer + * @author Cihad Guzel + */ +public class TestPrefixURLFilter extends TestCase { + private static final String prefixes = + "# this is a comment\n" + + "\n" + + "http://\n" + + "https://\n" + + "file://\n" + + "ftp://\n"; + + private static final String[] urls = new String[] { + "http://www.example.com/", + "https://www.example.com/", + "ftp://www.example.com/", + "file://www.example.com/", + "abcd://www.example.com/", + "www.example.com/", + }; + + private static String[] urlsModeAccept = new String[] { + urls[0], + urls[1], + urls[2], + urls[3], + null, + null + }; + + private PrefixURLFilter filter = null; + + public static Test suite() { + return new TestSuite(TestPrefixURLFilter.class); + } + + public static void main(String[] args) { + TestRunner.run(suite()); + } + + public void setUp() throws IOException { + filter = new PrefixURLFilter(prefixes); + } + + public void testModeAccept() { + for (int i = 0; i < urls.length; i++) { + assertTrue(urlsModeAccept[i] == filter.filter(urls[i])); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java b/nutch-plugins/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java deleted file mode 100644 index b7a7ce4..0000000 --- a/nutch-plugins/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.urlfilter.prefix; - -import junit.framework.Test; -import junit.framework.TestCase; -import junit.framework.TestSuite; -import junit.textui.TestRunner; - -import java.io.IOException; - - -/** - * JUnit test for <code>PrefixURLFilter</code>. - * - * @author Talat Uyarer - * @author Cihad Guzel - */ -public class TestPrefixURLFilter extends TestCase { - private static final String prefixes = - "# this is a comment\n" + - "\n" + - "http://\n" + - "https://\n" + - "file://\n" + - "ftp://\n"; - - private static final String[] urls = new String[] { - "http://www.example.com/", - "https://www.example.com/", - "ftp://www.example.com/", - "file://www.example.com/", - "abcd://www.example.com/", - "www.example.com/", - }; - - private static String[] urlsModeAccept = new String[] { - urls[0], - urls[1], - urls[2], - urls[3], - null, - null - }; - - private PrefixURLFilter filter = null; - - public static Test suite() { - return new TestSuite(TestPrefixURLFilter.class); - } - - public static void main(String[] args) { - TestRunner.run(suite()); - } - - public void setUp() throws IOException { - filter = new PrefixURLFilter(prefixes); - } - - public void testModeAccept() { - for (int i = 0; i < urls.length; i++) { - assertTrue(urlsModeAccept[i] == filter.filter(urls[i])); - } - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java new file mode 100644 index 0000000..b86181e --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.regex; + +// JDK imports +import java.io.IOException; +import java.io.Reader; + +import org.apache.nutch.net.*; +// Nutch imports +import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest; +import org.junit.Assert; +import org.junit.Test; + +/** + * JUnit based test of class <code>RegexURLFilter</code>. + * + * @author Jérôme Charron + */ +public class TestRegexURLFilter extends RegexURLFilterBaseTest { + + protected URLFilter getURLFilter(Reader rules) { + try { + return new RegexURLFilter(rules); + } catch (IOException e) { + Assert.fail(e.toString()); + return null; + } + } + + @Test + public void test() { + test("WholeWebCrawling"); + test("IntranetCrawling"); + bench(50, "Benchmarks"); + bench(100, "Benchmarks"); + bench(200, "Benchmarks"); + bench(400, "Benchmarks"); + bench(800, "Benchmarks"); + } + + @Test + public void test1838() { + test("nutch1838"); + } + +}
