Moved test sources to maven standard directory
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/20d28406 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/20d28406 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/20d28406 Branch: refs/heads/NUTCH-2292 Commit: 20d284068bdb918b0eea1a614644c5773bb42a12 Parents: ffa1678 Author: Thamme Gowda <[email protected]> Authored: Tue Jul 5 15:21:52 2016 -0700 Committer: Thamme Gowda <[email protected]> Committed: Tue Jul 5 15:21:52 2016 -0700 ---------------------------------------------------------------------- .../nutch/TestCCParseFilter.java | 73 +++ .../nutch/TestCCParseFilter.java | 73 --- .../apache/nutch/parse/feed/TestFeedParser.java | 124 +++++ .../apache/nutch/parse/feed/TestFeedParser.java | 124 ----- .../anchor/TestAnchorIndexingFilter.java | 67 +++ .../anchor/TestAnchorIndexingFilter.java | 67 --- .../indexer/basic/TestBasicIndexingFilter.java | 99 ++++ .../indexer/basic/TestBasicIndexingFilter.java | 99 ---- .../indexer/links/TestLinksIndexingFilter.java | 218 +++++++++ .../org/apache/nutch/parse/TestOutlinks.java | 54 +++ .../indexer/links/TestLinksIndexingFilter.java | 218 --------- .../org/apache/nutch/parse/TestOutlinks.java | 54 --- .../indexer/more/TestMoreIndexingFilter.java | 123 +++++ .../indexer/more/TestMoreIndexingFilter.java | 123 ----- .../nutch/indexer/replace/TestIndexReplace.java | 456 +++++++++++++++++++ .../nutch/indexer/replace/TestIndexReplace.java | 456 ------------------- .../staticfield/TestStaticFieldIndexerTest.java | 194 ++++++++ .../staticfield/TestStaticFieldIndexerTest.java | 194 -------- .../analysis/lang/TestHTMLLanguageParser.java | 149 ++++++ .../java/org/apache/nutch/analysis/lang/da.test | 108 +++++ .../java/org/apache/nutch/analysis/lang/de.test | 104 +++++ .../java/org/apache/nutch/analysis/lang/el.test | 109 +++++ .../java/org/apache/nutch/analysis/lang/en.test | 105 +++++ .../java/org/apache/nutch/analysis/lang/es.test | 107 +++++ .../java/org/apache/nutch/analysis/lang/fi.test | 106 +++++ .../java/org/apache/nutch/analysis/lang/fr.test | 105 +++++ .../java/org/apache/nutch/analysis/lang/it.test | 109 +++++ .../java/org/apache/nutch/analysis/lang/nl.test | 105 +++++ .../java/org/apache/nutch/analysis/lang/pt.test | 105 +++++ .../java/org/apache/nutch/analysis/lang/sv.test | 108 +++++ .../nutch/analysis/lang/test-referencial.txt | 10 + .../analysis/lang/TestHTMLLanguageParser.java | 149 ------ .../test/org/apache/nutch/analysis/lang/da.test | 108 ----- .../test/org/apache/nutch/analysis/lang/de.test | 104 ----- .../test/org/apache/nutch/analysis/lang/el.test | 109 ----- .../test/org/apache/nutch/analysis/lang/en.test | 105 ----- .../test/org/apache/nutch/analysis/lang/es.test | 107 ----- .../test/org/apache/nutch/analysis/lang/fi.test | 106 ----- .../test/org/apache/nutch/analysis/lang/fr.test | 105 ----- .../test/org/apache/nutch/analysis/lang/it.test | 109 ----- .../test/org/apache/nutch/analysis/lang/nl.test | 105 ----- .../test/org/apache/nutch/analysis/lang/pt.test | 105 ----- .../test/org/apache/nutch/analysis/lang/sv.test | 108 ----- .../nutch/analysis/lang/test-referencial.txt | 10 - .../protocol/http/api/TestRobotRulesParser.java | 123 +++++ .../protocol/http/api/TestRobotRulesParser.java | 123 ----- .../filter/MimeTypeIndexingFilterTest.java | 114 +++++ .../filter/MimeTypeIndexingFilterTest.java | 114 ----- .../apache/nutch/parse/ext/TestExtParser.java | 130 ++++++ .../apache/nutch/parse/ext/TestExtParser.java | 130 ------ .../nutch/parse/html/TestDOMContentUtils.java | 347 ++++++++++++++ .../apache/nutch/parse/html/TestHtmlParser.java | 122 +++++ .../parse/html/TestRobotsMetaProcessor.java | 155 +++++++ .../nutch/parse/html/TestDOMContentUtils.java | 347 -------------- .../apache/nutch/parse/html/TestHtmlParser.java | 122 ----- .../parse/html/TestRobotsMetaProcessor.java | 155 ------- .../nutch/parse/metatags/TestMetatagParser.java | 104 +++++ .../nutch/parse/metatags/TestMetatagParser.java | 104 ----- .../nutch/parse/replace/TestParseReplace.java | 68 +++ .../nutch/parse/replace/TestParseReplace.java | 68 --- .../apache/nutch/parse/swf/TestSWFParser.java | 94 ++++ .../apache/nutch/parse/swf/TestSWFParser.java | 94 ---- .../apache/nutch/tika/TestDOMContentUtils.java | 337 ++++++++++++++ .../org/apache/nutch/tika/TestFeedParser.java | 121 +++++ .../apache/nutch/tika/TestImageMetadata.java | 67 +++ .../org/apache/nutch/tika/TestMSWordParser.java | 92 ++++ .../org/apache/nutch/tika/TestOOParser.java | 107 +++++ .../org/apache/nutch/tika/TestPdfParser.java | 73 +++ .../org/apache/nutch/tika/TestRTFParser.java | 81 ++++ .../nutch/tika/TestRobotsMetaProcessor.java | 156 +++++++ .../apache/nutch/tika/TestDOMContentUtils.java | 337 -------------- .../org/apache/nutch/tika/TestFeedParser.java | 121 ----- .../apache/nutch/tika/TestImageMetadata.java | 67 --- .../org/apache/nutch/tika/TestMSWordParser.java | 92 ---- .../org/apache/nutch/tika/TestOOParser.java | 107 ----- .../org/apache/nutch/tika/TestPdfParser.java | 73 --- .../org/apache/nutch/tika/TestRTFParser.java | 81 ---- .../nutch/tika/TestRobotsMetaProcessor.java | 156 ------- .../apache/nutch/parse/zip/TestZipParser.java | 71 +++ .../apache/nutch/parse/zip/TestZipParser.java | 71 --- .../parsefilter/regex/TestRegexParseFilter.java | 77 ++++ .../parsefilter/regex/TestRegexParseFilter.java | 77 ---- .../nutch/protocol/file/TestProtocolFile.java | 99 ++++ .../nutch/protocol/file/TestProtocolFile.java | 99 ---- .../nutch/protocol/http/TestProtocolHttp.java | 140 ++++++ .../nutch/protocol/http/TestProtocolHttp.java | 140 ------ .../httpclient/TestProtocolHttpClient.java | 217 +++++++++ .../httpclient/TestProtocolHttpClient.java | 217 --------- .../nutch/collection/TestSubcollection.java | 112 +++++ .../nutch/collection/TestSubcollection.java | 112 ----- .../automaton/TestAutomatonURLFilter.java | 56 +++ .../automaton/TestAutomatonURLFilter.java | 56 --- .../urlfilter/domain/TestDomainURLFilter.java | 67 +++ .../urlfilter/domain/TestDomainURLFilter.java | 67 --- .../TestDomainBlacklistURLFilter.java | 49 ++ .../TestDomainBlacklistURLFilter.java | 49 -- .../urlfilter/prefix/TestPrefixURLFilter.java | 79 ++++ .../urlfilter/prefix/TestPrefixURLFilter.java | 79 ---- .../urlfilter/regex/TestRegexURLFilter.java | 61 +++ .../urlfilter/regex/TestRegexURLFilter.java | 61 --- .../urlfilter/suffix/TestSuffixURLFilter.java | 123 +++++ .../urlfilter/suffix/TestSuffixURLFilter.java | 123 ----- .../urlfilter/validator/TestUrlValidator.java | 79 ++++ .../urlfilter/validator/TestUrlValidator.java | 79 ---- .../ajax/TestAjaxURLNormalizer.java | 67 +++ .../ajax/TestAjaxURLNormalizer.java | 67 --- .../basic/TestBasicURLNormalizer.java | 175 +++++++ .../basic/TestBasicURLNormalizer.java | 175 ------- .../host/TestHostURLNormalizer.java | 57 +++ .../host/TestHostURLNormalizer.java | 57 --- .../pass/TestPassURLNormalizer.java | 45 ++ .../pass/TestPassURLNormalizer.java | 45 -- .../protocol/TestProtocolURLNormalizer.java | 55 +++ .../protocol/TestProtocolURLNormalizer.java | 55 --- .../TestQuerystringURLNormalizer.java | 49 ++ .../TestQuerystringURLNormalizer.java | 49 -- .../regex/TestRegexURLNormalizer.java | 186 ++++++++ .../regex/TestRegexURLNormalizer.java | 186 -------- .../slash/TestSlashURLNormalizer.java | 73 +++ .../slash/TestSlashURLNormalizer.java | 73 --- 120 files changed, 6966 insertions(+), 6966 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java b/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java new file mode 100755 index 0000000..41be9ed --- /dev/null +++ b/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.creativecommons.nutch; + +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +import java.io.*; + +public class TestCCParseFilter { + + private static final File testDir = new File(System.getProperty("test.input")); + + @Test + public void testPages() throws Exception { + pageTest(new File(testDir, "anchor.html"), "http://foo.com/", + "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null); + // Tika returns <a> whereas parse-html returns <rel> + // check later + pageTest(new File(testDir, "rel.html"), "http://foo.com/", + "http://creativecommons.org/licenses/by-nc/2.0", "rel", null); + // Tika returns <a> whereas parse-html returns <rdf> + // check later + pageTest(new File(testDir, "rdf.html"), "http://foo.com/", + "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text"); + } + + public void pageTest(File file, String url, String license, String location, + String type) throws Exception { + + String contentType = "text/html"; + InputStream in = new FileInputStream(file); + ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length()); + byte[] buffer = new byte[1024]; + int i; + while ((i = in.read(buffer)) != -1) { + out.write(buffer, 0, i); + } + in.close(); + byte[] bytes = out.toByteArray(); + Configuration conf = NutchConfiguration.create(); + + Content content = new Content(url, url, bytes, contentType, new Metadata(), + conf); + Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); + + Metadata metadata = parse.getData().getParseMeta(); + Assert.assertEquals(license, metadata.get("License-Url")); + Assert.assertEquals(location, metadata.get("License-Location")); + Assert.assertEquals(type, metadata.get("Work-Type")); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java b/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java deleted file mode 100755 index 41be9ed..0000000 --- a/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.creativecommons.nutch; - -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.protocol.Content; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -import java.io.*; - -public class TestCCParseFilter { - - private static final File testDir = new File(System.getProperty("test.input")); - - @Test - public void testPages() throws Exception { - pageTest(new File(testDir, "anchor.html"), "http://foo.com/", - "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null); - // Tika returns <a> whereas parse-html returns <rel> - // check later - pageTest(new File(testDir, "rel.html"), "http://foo.com/", - "http://creativecommons.org/licenses/by-nc/2.0", "rel", null); - // Tika returns <a> whereas parse-html returns <rdf> - // check later - pageTest(new File(testDir, "rdf.html"), "http://foo.com/", - "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text"); - } - - public void pageTest(File file, String url, String license, String location, - String type) throws Exception { - - String contentType = "text/html"; - InputStream in = new FileInputStream(file); - ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length()); - byte[] buffer = new byte[1024]; - int i; - while ((i = in.read(buffer)) != -1) { - out.write(buffer, 0, i); - } - in.close(); - byte[] bytes = out.toByteArray(); - Configuration conf = NutchConfiguration.create(); - - Content content = new Content(url, url, bytes, contentType, new Metadata(), - conf); - Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); - - Metadata metadata = parse.getData().getParseMeta(); - Assert.assertEquals(license, metadata.get("License-Url")); - Assert.assertEquals(location, metadata.get("License-Location")); - Assert.assertEquals(type, metadata.get("Work-Type")); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java b/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java new file mode 100644 index 0000000..36c8739 --- /dev/null +++ b/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.feed; + +// JDK imports +import java.util.Iterator; +import java.util.Map; + +import org.junit.Assert; +import org.junit.Test; +// APACHE imports +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.ProtocolNotFound; +import org.apache.nutch.util.NutchConfiguration; + +/** + * + * @author mattmann + * + * Test Suite for the {@link FeedParser}. + * + */ +public class TestFeedParser { + + private String fileSeparator = System.getProperty("file.separator"); + + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/feed/build.xml during plugin compilation. + + private String[] sampleFiles = { "rsstest.rss" }; + + public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class + .getName()); + + /** + * Calls the {@link FeedParser} on a sample RSS file and checks that there are + * 3 {@link ParseResult} entries including the below 2 links: + * <ul> + * <li>http://www-scf.usc.edu/~mattmann/</li> + * <li>http://www.nutch.org</li> + * </ul> + * + * + * @throws ProtocolNotFound + * If the {@link Protocol}Layer cannot be loaded (required to fetch + * the {@link Content} for the RSS file). + * @throws ParseException + * If the {@link Parser}Layer cannot be loaded. + */ + @Test + public void testParseFetchChannel() throws ProtocolNotFound, ParseException { + String urlString; + Protocol protocol; + Content content; + ParseResult parseResult; + + Configuration conf = NutchConfiguration.create(); + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + urlString = urlString.replace('\\', '/'); + + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + + parseResult = new ParseUtil(conf).parseByExtensionId("feed", content); + + Assert.assertEquals(3, parseResult.size()); + + boolean hasLink1 = false, hasLink2 = false, hasLink3 = false; + + for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j + .hasNext();) { + Map.Entry<Text, Parse> entry = j.next(); + if (entry.getKey().toString() + .equals("http://www-scf.usc.edu/~mattmann/")) { + hasLink1 = true; + } else if (entry.getKey().toString().equals("http://www.nutch.org/")) { + hasLink2 = true; + } else if (entry.getKey().toString().equals(urlString)) { + hasLink3 = true; + } + + Assert.assertNotNull(entry.getValue()); + Assert.assertNotNull(entry.getValue().getData()); + } + + if (!hasLink1 || !hasLink2 || !hasLink3) { + Assert.fail("Outlinks read from sample rss file are not correct!"); + } + } + + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java b/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java deleted file mode 100644 index 36c8739..0000000 --- a/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java +++ /dev/null @@ -1,124 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.feed; - -// JDK imports -import java.util.Iterator; -import java.util.Map; - -import org.junit.Assert; -import org.junit.Test; -// APACHE imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseException; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.ProtocolNotFound; -import org.apache.nutch.util.NutchConfiguration; - -/** - * - * @author mattmann - * - * Test Suite for the {@link FeedParser}. - * - */ -public class TestFeedParser { - - private String fileSeparator = System.getProperty("file.separator"); - - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/feed/build.xml during plugin compilation. - - private String[] sampleFiles = { "rsstest.rss" }; - - public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class - .getName()); - - /** - * Calls the {@link FeedParser} on a sample RSS file and checks that there are - * 3 {@link ParseResult} entries including the below 2 links: - * <ul> - * <li>http://www-scf.usc.edu/~mattmann/</li> - * <li>http://www.nutch.org</li> - * </ul> - * - * - * @throws ProtocolNotFound - * If the {@link Protocol}Layer cannot be loaded (required to fetch - * the {@link Content} for the RSS file). - * @throws ParseException - * If the {@link Parser}Layer cannot be loaded. - */ - @Test - public void testParseFetchChannel() throws ProtocolNotFound, ParseException { - String urlString; - Protocol protocol; - Content content; - ParseResult parseResult; - - Configuration conf = NutchConfiguration.create(); - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - urlString = urlString.replace('\\', '/'); - - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - - parseResult = new ParseUtil(conf).parseByExtensionId("feed", content); - - Assert.assertEquals(3, parseResult.size()); - - boolean hasLink1 = false, hasLink2 = false, hasLink3 = false; - - for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j - .hasNext();) { - Map.Entry<Text, Parse> entry = j.next(); - if (entry.getKey().toString() - .equals("http://www-scf.usc.edu/~mattmann/")) { - hasLink1 = true; - } else if (entry.getKey().toString().equals("http://www.nutch.org/")) { - hasLink2 = true; - } else if (entry.getKey().toString().equals(urlString)) { - hasLink3 = true; - } - - Assert.assertNotNull(entry.getValue()); - Assert.assertNotNull(entry.getValue().getData()); - } - - if (!hasLink1 || !hasLink2 || !hasLink3) { - Assert.fail("Outlinks read from sample rss file are not correct!"); - } - } - - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java b/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java new file mode 100644 index 0000000..08a42f3 --- /dev/null +++ b/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.anchor; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlink; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +/** + * JUnit test case which tests 1. that anchor text is obtained 2. that anchor + * deduplication functionality is working + * + * @author lewismc + * + */ +public class TestAnchorIndexingFilter { + + @Test + public void testDeduplicateAnchor() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.setBoolean("anchorIndexingFilter.deduplicate", true); + AnchorIndexingFilter filter = new AnchorIndexingFilter(); + filter.setConf(conf); + Assert.assertNotNull(filter); + NutchDocument doc = new NutchDocument(); + ParseImpl parse = new ParseImpl("foo bar", new ParseData()); + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://test1.com/", "text1")); + inlinks.add(new Inlink("http://test2.com/", "text2")); + inlinks.add(new Inlink("http://test3.com/", "text2")); + try { + filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), + new CrawlDatum(), inlinks); + } catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.getMessage()); + } + Assert.assertNotNull(doc); + Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames() + .contains("anchor")); + Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor") + .getValues().size()); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java b/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java deleted file mode 100644 index 08a42f3..0000000 --- a/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.indexer.anchor; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlink; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -/** - * JUnit test case which tests 1. that anchor text is obtained 2. that anchor - * deduplication functionality is working - * - * @author lewismc - * - */ -public class TestAnchorIndexingFilter { - - @Test - public void testDeduplicateAnchor() throws Exception { - Configuration conf = NutchConfiguration.create(); - conf.setBoolean("anchorIndexingFilter.deduplicate", true); - AnchorIndexingFilter filter = new AnchorIndexingFilter(); - filter.setConf(conf); - Assert.assertNotNull(filter); - NutchDocument doc = new NutchDocument(); - ParseImpl parse = new ParseImpl("foo bar", new ParseData()); - Inlinks inlinks = new Inlinks(); - inlinks.add(new Inlink("http://test1.com/", "text1")); - inlinks.add(new Inlink("http://test2.com/", "text2")); - inlinks.add(new Inlink("http://test3.com/", "text2")); - try { - filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), - new CrawlDatum(), inlinks); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.getMessage()); - } - Assert.assertNotNull(doc); - Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames() - .contains("anchor")); - Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor") - .getValues().size()); - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java b/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java new file mode 100644 index 0000000..4bc317e --- /dev/null +++ b/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.basic; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.basic.BasicIndexingFilter; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Date; + +/** + * JUnit test case which tests 1. that basic searchable fields are added to a + * document 2. that domain is added as per {@code indexer.add.domain} in + * nutch-default.xml. 3. that title is truncated as per + * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is + * truncated as per {@code indexer.max.content.length} in nutch-default.xml. + * + * @author tejasp + * + */ + +public class TestBasicIndexingFilter { + + @Test + public void testBasicIndexingFilter() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.setInt("indexer.max.title.length", 10); + conf.setBoolean("indexer.add.domain", true); + conf.setInt("indexer.max.content.length", 20); + + BasicIndexingFilter filter = new BasicIndexingFilter(); + filter.setConf(conf); + Assert.assertNotNull(filter); + + NutchDocument doc = new NutchDocument(); + + String title = "The Foo Page"; + Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") }; + Metadata metaData = new Metadata(); + metaData.add("Language", "en/us"); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, + outlinks, metaData); + ParseImpl parse = new ParseImpl( + "this is a sample foo bar page. hope you enjoy it.", parseData); + + CrawlDatum crawlDatum = new CrawlDatum(); + crawlDatum.setFetchTime(100L); + + Inlinks inlinks = new Inlinks(); + + try { + filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), + crawlDatum, inlinks); + } catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.getMessage()); + } + Assert.assertNotNull(doc); + Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc + .getField("title").getValues().get(0)); + Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc + .getField("domain").getValues().get(0)); + Assert.assertEquals("test host, expect \"nutch.apache.org\"", + "nutch.apache.org", doc.getField("host").getValues().get(0)); + Assert.assertEquals( + "test url, expect \"http://nutch.apache.org/index.html\"", + "http://nutch.apache.org/index.html", doc.getField("url").getValues() + .get(0)); + Assert.assertEquals("test content", "this is a sample foo", + doc.getField("content").getValues().get(0)); + Assert.assertEquals("test fetch time", new Date(100L), + (Date) doc.getField("tstamp").getValues().get(0)); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java b/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java deleted file mode 100644 index 4bc317e..0000000 --- a/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.indexer.basic; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.indexer.basic.BasicIndexingFilter; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -import java.util.Date; - -/** - * JUnit test case which tests 1. that basic searchable fields are added to a - * document 2. that domain is added as per {@code indexer.add.domain} in - * nutch-default.xml. 3. that title is truncated as per - * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is - * truncated as per {@code indexer.max.content.length} in nutch-default.xml. - * - * @author tejasp - * - */ - -public class TestBasicIndexingFilter { - - @Test - public void testBasicIndexingFilter() throws Exception { - Configuration conf = NutchConfiguration.create(); - conf.setInt("indexer.max.title.length", 10); - conf.setBoolean("indexer.add.domain", true); - conf.setInt("indexer.max.content.length", 20); - - BasicIndexingFilter filter = new BasicIndexingFilter(); - filter.setConf(conf); - Assert.assertNotNull(filter); - - NutchDocument doc = new NutchDocument(); - - String title = "The Foo Page"; - Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") }; - Metadata metaData = new Metadata(); - metaData.add("Language", "en/us"); - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, - outlinks, metaData); - ParseImpl parse = new ParseImpl( - "this is a sample foo bar page. hope you enjoy it.", parseData); - - CrawlDatum crawlDatum = new CrawlDatum(); - crawlDatum.setFetchTime(100L); - - Inlinks inlinks = new Inlinks(); - - try { - filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), - crawlDatum, inlinks); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.getMessage()); - } - Assert.assertNotNull(doc); - Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc - .getField("title").getValues().get(0)); - Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc - .getField("domain").getValues().get(0)); - Assert.assertEquals("test host, expect \"nutch.apache.org\"", - "nutch.apache.org", doc.getField("host").getValues().get(0)); - Assert.assertEquals( - "test url, expect \"http://nutch.apache.org/index.html\"", - "http://nutch.apache.org/index.html", doc.getField("url").getValues() - .get(0)); - Assert.assertEquals("test content", "this is a sample foo", - doc.getField("content").getValues().get(0)); - Assert.assertEquals("test fetch time", new Date(100L), - (Date) doc.getField("tstamp").getValues().get(0)); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java b/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java new file mode 100644 index 0000000..c490d1f --- /dev/null +++ b/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java @@ -0,0 +1,218 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.links; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlink; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.NutchField; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.util.NutchConfiguration; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.net.URL; +import java.util.Iterator; + +public class TestLinksIndexingFilter { + + Configuration conf = NutchConfiguration.create(); + LinksIndexingFilter filter = new LinksIndexingFilter(); + Metadata metadata = new Metadata(); + + @Before + public void setUp() throws Exception { + metadata.add(Response.CONTENT_TYPE, "text/html"); + } + + private Outlink[] generateOutlinks() throws Exception { + return generateOutlinks(false); + } + + private Outlink[] generateOutlinks(boolean parts) throws Exception { + Outlink[] outlinks = new Outlink[2]; + + outlinks[0] = new Outlink("http://www.test.com", "test"); + outlinks[1] = new Outlink("http://www.example.com", "example"); + + if (parts) { + outlinks[0] = new Outlink(outlinks[0].getToUrl() + "/index.php?param=1", + "test"); + outlinks[1] = new Outlink(outlinks[1].getToUrl() + "/index.php?param=2", + "test"); + } + + return outlinks; + } + + @Test + public void testFilterOutlinks() throws Exception { + conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); + filter.setConf(conf); + + Outlink[] outlinks = generateOutlinks(); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", outlinks, metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + Assert.assertEquals(1, doc.getField("outlinks").getValues().size()); + + Assert.assertEquals("Filter outlinks, allow only those from a different host", + outlinks[0].getToUrl(), doc.getFieldValue("outlinks")); + } + + @Test + public void testFilterInlinks() throws Exception { + conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); + filter.setConf(conf); + + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://www.test.com", "test")); + inlinks.add(new Inlink("http://www.example.com", "example")); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), inlinks); + + Assert.assertEquals(1, doc.getField("inlinks").getValues().size()); + + Assert.assertEquals("Filter inlinks, allow only those from a different host", + "http://www.test.com", doc.getFieldValue("inlinks")); + } + + @Test + public void testNoFilterOutlinks() throws Exception { + filter.setConf(conf); + + Outlink[] outlinks = generateOutlinks(); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", outlinks, metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + Assert.assertEquals("All outlinks must be indexed even those from the same host", + outlinks.length, doc.getField("outlinks").getValues().size()); + } + + @Test + public void testNoFilterInlinks() throws Exception { + conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false"); + filter.setConf(conf); + + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://www.test.com", "test")); + inlinks.add(new Inlink("http://www.example.com", "example")); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), inlinks); + + Assert.assertEquals("All inlinks must be indexed even those from the same host", + inlinks.size(), doc.getField("inlinks").getValues().size()); + } + + @Test + public void testIndexOnlyHostPart() throws Exception { + conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); + conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); + conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); + filter.setConf(conf); + + Outlink[] outlinks = generateOutlinks(true); + + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test")); + inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test")); + inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example", + "example")); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", outlinks, metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), inlinks); + + NutchField docOutlinks = doc.getField("outlinks"); + + Assert.assertEquals("Only the host portion of the outlink URL must be indexed", + new URL("http://www.test.com").getHost(), + docOutlinks.getValues().get(0)); + + Assert.assertEquals( + "The inlinks coming from the same host must count only once", 1, + doc.getField("inlinks").getValues().size()); + + Assert.assertEquals("Only the host portion of the inlinks URL must be indexed", + new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks")); + } + + @Test + public void testIndexHostsOnlyAndFilterOutlinks() throws Exception { + conf = NutchConfiguration.create(); + conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); + conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); + + Outlink[] outlinks = generateOutlinks(true); + + filter.setConf(conf); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", outlinks, metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + Assert.assertEquals(1, doc.getField("outlinks").getValues().size()); + + Assert.assertEquals( + "Index only the host portion of the outlinks after filtering", + new URL("http://www.test.com").getHost(), + doc.getFieldValue("outlinks")); + } + + @Test + public void testIndexHostsOnlyAndFilterInlinks() throws Exception { + conf = NutchConfiguration.create(); + conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); + conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); + + filter.setConf(conf); + + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://www.test.com", "test")); + inlinks.add(new Inlink("http://www.example.com", "example")); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), inlinks); + + Assert.assertEquals(1, doc.getField("inlinks").getValues().size()); + + Assert.assertEquals( + "Index only the host portion of the inlinks after filtering", + new URL("http://www.test.com").getHost(), + doc.getFieldValue("inlinks")); + + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java b/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java new file mode 100644 index 0000000..aaaedbf --- /dev/null +++ b/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse; + +import org.junit.Test; + +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.*; + +public class TestOutlinks { + + @Test + public void testAddSameObject() throws Exception { + Set<Outlink> set = new HashSet<>(); + + Outlink o = new Outlink("http://www.example.com", "Example"); + set.add(o); + set.add(o); + + assertEquals("Adding the same Outlink twice", 1, set.size()); + } + + @Test + public void testAddOtherObjectWithSameData() throws Exception { + Set<Outlink> set = new HashSet<>(); + + Outlink o = new Outlink("http://www.example.com", "Example"); + Outlink o1 = new Outlink("http://www.example.com", "Example"); + + assertTrue("The two Outlink objects are the same", o.equals(o1)); + + set.add(o); + set.add(o1); + + assertEquals("The set should contain only 1 Outlink", 1, set.size()); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java b/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java deleted file mode 100644 index c490d1f..0000000 --- a/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java +++ /dev/null @@ -1,218 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.indexer.links; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlink; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.indexer.NutchField; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.util.NutchConfiguration; - -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.net.URL; -import java.util.Iterator; - -public class TestLinksIndexingFilter { - - Configuration conf = NutchConfiguration.create(); - LinksIndexingFilter filter = new LinksIndexingFilter(); - Metadata metadata = new Metadata(); - - @Before - public void setUp() throws Exception { - metadata.add(Response.CONTENT_TYPE, "text/html"); - } - - private Outlink[] generateOutlinks() throws Exception { - return generateOutlinks(false); - } - - private Outlink[] generateOutlinks(boolean parts) throws Exception { - Outlink[] outlinks = new Outlink[2]; - - outlinks[0] = new Outlink("http://www.test.com", "test"); - outlinks[1] = new Outlink("http://www.example.com", "example"); - - if (parts) { - outlinks[0] = new Outlink(outlinks[0].getToUrl() + "/index.php?param=1", - "test"); - outlinks[1] = new Outlink(outlinks[1].getToUrl() + "/index.php?param=2", - "test"); - } - - return outlinks; - } - - @Test - public void testFilterOutlinks() throws Exception { - conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); - filter.setConf(conf); - - Outlink[] outlinks = generateOutlinks(); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", outlinks, metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); - - Assert.assertEquals(1, doc.getField("outlinks").getValues().size()); - - Assert.assertEquals("Filter outlinks, allow only those from a different host", - outlinks[0].getToUrl(), doc.getFieldValue("outlinks")); - } - - @Test - public void testFilterInlinks() throws Exception { - conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); - filter.setConf(conf); - - Inlinks inlinks = new Inlinks(); - inlinks.add(new Inlink("http://www.test.com", "test")); - inlinks.add(new Inlink("http://www.example.com", "example")); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), inlinks); - - Assert.assertEquals(1, doc.getField("inlinks").getValues().size()); - - Assert.assertEquals("Filter inlinks, allow only those from a different host", - "http://www.test.com", doc.getFieldValue("inlinks")); - } - - @Test - public void testNoFilterOutlinks() throws Exception { - filter.setConf(conf); - - Outlink[] outlinks = generateOutlinks(); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", outlinks, metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); - - Assert.assertEquals("All outlinks must be indexed even those from the same host", - outlinks.length, doc.getField("outlinks").getValues().size()); - } - - @Test - public void testNoFilterInlinks() throws Exception { - conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false"); - filter.setConf(conf); - - Inlinks inlinks = new Inlinks(); - inlinks.add(new Inlink("http://www.test.com", "test")); - inlinks.add(new Inlink("http://www.example.com", "example")); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), inlinks); - - Assert.assertEquals("All inlinks must be indexed even those from the same host", - inlinks.size(), doc.getField("inlinks").getValues().size()); - } - - @Test - public void testIndexOnlyHostPart() throws Exception { - conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); - conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); - conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); - filter.setConf(conf); - - Outlink[] outlinks = generateOutlinks(true); - - Inlinks inlinks = new Inlinks(); - inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test")); - inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test")); - inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example", - "example")); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", outlinks, metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), inlinks); - - NutchField docOutlinks = doc.getField("outlinks"); - - Assert.assertEquals("Only the host portion of the outlink URL must be indexed", - new URL("http://www.test.com").getHost(), - docOutlinks.getValues().get(0)); - - Assert.assertEquals( - "The inlinks coming from the same host must count only once", 1, - doc.getField("inlinks").getValues().size()); - - Assert.assertEquals("Only the host portion of the inlinks URL must be indexed", - new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks")); - } - - @Test - public void testIndexHostsOnlyAndFilterOutlinks() throws Exception { - conf = NutchConfiguration.create(); - conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); - conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); - - Outlink[] outlinks = generateOutlinks(true); - - filter.setConf(conf); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", outlinks, metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); - - Assert.assertEquals(1, doc.getField("outlinks").getValues().size()); - - Assert.assertEquals( - "Index only the host portion of the outlinks after filtering", - new URL("http://www.test.com").getHost(), - doc.getFieldValue("outlinks")); - } - - @Test - public void testIndexHostsOnlyAndFilterInlinks() throws Exception { - conf = NutchConfiguration.create(); - conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); - conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); - - filter.setConf(conf); - - Inlinks inlinks = new Inlinks(); - inlinks.add(new Inlink("http://www.test.com", "test")); - inlinks.add(new Inlink("http://www.example.com", "example")); - - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", - new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), - new Text("http://www.example.com/"), new CrawlDatum(), inlinks); - - Assert.assertEquals(1, doc.getField("inlinks").getValues().size()); - - Assert.assertEquals( - "Index only the host portion of the inlinks after filtering", - new URL("http://www.test.com").getHost(), - doc.getFieldValue("inlinks")); - - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java b/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java deleted file mode 100644 index aaaedbf..0000000 --- a/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse; - -import org.junit.Test; - -import java.util.HashSet; -import java.util.Set; - -import static org.junit.Assert.*; - -public class TestOutlinks { - - @Test - public void testAddSameObject() throws Exception { - Set<Outlink> set = new HashSet<>(); - - Outlink o = new Outlink("http://www.example.com", "Example"); - set.add(o); - set.add(o); - - assertEquals("Adding the same Outlink twice", 1, set.size()); - } - - @Test - public void testAddOtherObjectWithSameData() throws Exception { - Set<Outlink> set = new HashSet<>(); - - Outlink o = new Outlink("http://www.example.com", "Example"); - Outlink o1 = new Outlink("http://www.example.com", "Example"); - - assertTrue("The two Outlink objects are the same", o.equals(o1)); - - set.add(o); - set.add(o1); - - assertEquals("The set should contain only 1 Outlink", 1, set.size()); - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java new file mode 100644 index 0000000..f918dde --- /dev/null +++ b/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.more; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +public class TestMoreIndexingFilter { + + @Test + public void testContentType() throws IndexingException { + Configuration conf = NutchConfiguration.create(); + assertContentType(conf, "text/html", "text/html"); + assertContentType(conf, "text/html; charset=UTF-8", "text/html"); + } + + @Test + public void testGetParts() { + String[] parts = MoreIndexingFilter.getParts("text/html"); + assertParts(parts, 2, "text", "html"); + } + + /** + * @since NUTCH-901 + */ + @Test + public void testNoParts() { + Configuration conf = NutchConfiguration.create(); + conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false); + MoreIndexingFilter filter = new MoreIndexingFilter(); + filter.setConf(conf); + Assert.assertNotNull(filter); + NutchDocument doc = new NutchDocument(); + ParseImpl parse = new ParseImpl("foo bar", new ParseData()); + + try { + filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), + new CrawlDatum(), new Inlinks()); + } catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.getMessage()); + } + Assert.assertNotNull(doc); + Assert.assertTrue(doc.getFieldNames().contains("type")); + Assert.assertEquals(1, doc.getField("type").getValues().size()); + Assert.assertEquals("text/html", doc.getFieldValue("type")); + } + + @Test + public void testContentDispositionTitle() throws IndexingException { + Configuration conf = NutchConfiguration.create(); + + Metadata metadata = new Metadata(); + metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext"); + MoreIndexingFilter filter = new MoreIndexingFilter(); + filter.setConf(conf); + + Text url = new Text("http://www.example.com/"); + ParseImpl parseImpl = new ParseImpl("text", new ParseData( + new ParseStatus(), "title", new Outlink[0], metadata)); + + NutchDocument doc = new NutchDocument(); + doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks()); + + Assert.assertEquals("content-disposition not detected", "filename.ext", + doc.getFieldValue("title")); + + /* NUTCH-1140: do not add second title to avoid a multi-valued title field */ + doc = new NutchDocument(); + doc.add("title", "title"); + doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks()); + Assert.assertEquals("do not add second title by content-disposition", + "title", doc.getFieldValue("title")); + } + + private void assertParts(String[] parts, int count, String... expected) { + Assert.assertEquals(count, parts.length); + for (int i = 0; i < expected.length; i++) { + Assert.assertEquals(expected[i], parts[i]); + } + } + + private void assertContentType(Configuration conf, String source, + String expected) throws IndexingException { + Metadata metadata = new Metadata(); + metadata.add(Response.CONTENT_TYPE, source); + MoreIndexingFilter filter = new MoreIndexingFilter(); + filter.setConf(conf); + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl( + "text", new ParseData(new ParseStatus(), "title", new Outlink[0], + metadata)), new Text("http://www.example.com/"), new CrawlDatum(), + new Inlinks()); + Assert.assertEquals("mime type not detected", expected, + doc.getFieldValue("type")); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java deleted file mode 100644 index f918dde..0000000 --- a/nutch-plugins/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.indexer.more; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -public class TestMoreIndexingFilter { - - @Test - public void testContentType() throws IndexingException { - Configuration conf = NutchConfiguration.create(); - assertContentType(conf, "text/html", "text/html"); - assertContentType(conf, "text/html; charset=UTF-8", "text/html"); - } - - @Test - public void testGetParts() { - String[] parts = MoreIndexingFilter.getParts("text/html"); - assertParts(parts, 2, "text", "html"); - } - - /** - * @since NUTCH-901 - */ - @Test - public void testNoParts() { - Configuration conf = NutchConfiguration.create(); - conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false); - MoreIndexingFilter filter = new MoreIndexingFilter(); - filter.setConf(conf); - Assert.assertNotNull(filter); - NutchDocument doc = new NutchDocument(); - ParseImpl parse = new ParseImpl("foo bar", new ParseData()); - - try { - filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), - new CrawlDatum(), new Inlinks()); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.getMessage()); - } - Assert.assertNotNull(doc); - Assert.assertTrue(doc.getFieldNames().contains("type")); - Assert.assertEquals(1, doc.getField("type").getValues().size()); - Assert.assertEquals("text/html", doc.getFieldValue("type")); - } - - @Test - public void testContentDispositionTitle() throws IndexingException { - Configuration conf = NutchConfiguration.create(); - - Metadata metadata = new Metadata(); - metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext"); - MoreIndexingFilter filter = new MoreIndexingFilter(); - filter.setConf(conf); - - Text url = new Text("http://www.example.com/"); - ParseImpl parseImpl = new ParseImpl("text", new ParseData( - new ParseStatus(), "title", new Outlink[0], metadata)); - - NutchDocument doc = new NutchDocument(); - doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks()); - - Assert.assertEquals("content-disposition not detected", "filename.ext", - doc.getFieldValue("title")); - - /* NUTCH-1140: do not add second title to avoid a multi-valued title field */ - doc = new NutchDocument(); - doc.add("title", "title"); - doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks()); - Assert.assertEquals("do not add second title by content-disposition", - "title", doc.getFieldValue("title")); - } - - private void assertParts(String[] parts, int count, String... expected) { - Assert.assertEquals(count, parts.length); - for (int i = 0; i < expected.length; i++) { - Assert.assertEquals(expected[i], parts[i]); - } - } - - private void assertContentType(Configuration conf, String source, - String expected) throws IndexingException { - Metadata metadata = new Metadata(); - metadata.add(Response.CONTENT_TYPE, source); - MoreIndexingFilter filter = new MoreIndexingFilter(); - filter.setConf(conf); - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl( - "text", new ParseData(new ParseStatus(), "title", new Outlink[0], - metadata)), new Text("http://www.example.com/"), new CrawlDatum(), - new Inlinks()); - Assert.assertEquals("mime type not detected", expected, - doc.getFieldValue("type")); - } -}
