Author: jorgelbg
Date: Thu Oct 22 15:26:34 2015
New Revision: 1710033
URL: http://svn.apache.org/viewvc?rev=1710033&view=rev
Log:
NUTCH-2139 Basic plugin to index inlinks and outlinks
Added:
nutch/trunk/src/plugin/index-links/
nutch/trunk/src/plugin/index-links/build.xml
nutch/trunk/src/plugin/index-links/ivy.xml
nutch/trunk/src/plugin/index-links/plugin.xml
nutch/trunk/src/plugin/index-links/src/
nutch/trunk/src/plugin/index-links/src/java/
nutch/trunk/src/plugin/index-links/src/java/org/
nutch/trunk/src/plugin/index-links/src/java/org/apache/
nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/
nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/
nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/
nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
nutch/trunk/src/plugin/index-links/src/test/
nutch/trunk/src/plugin/index-links/src/test/org/
nutch/trunk/src/plugin/index-links/src/test/org/apache/
nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/
nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/
nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/
nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/build.xml
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/build.xml
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1710033&r1=1710032&r2=1710033&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Oct 22 15:26:34 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2139 Basic plugin to index inlinks and outlinks (jorgelbg)
+
* NUTCH-2128 Review and update mapred --> mapreduce config params in crawl
script (lewismc)
* NUTCH-2141 Change the InteractiveSelenium plugin handler Interface to return
page content
Modified: nutch/trunk/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1710033&r1=1710032&r2=1710033&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Thu Oct 22 15:26:34 2015
@@ -178,6 +178,7 @@
<packageset dir="${plugins.dir}/index-geoip/src/java"/>
<packageset dir="${plugins.dir}/index-replace/src/java"/>
<packageset dir="${plugins.dir}/index-static/src/java"/>
+ <packageset dir="${plugins.dir}/index-links/src/java"/>
<packageset dir="${plugins.dir}/mimetype-filter/src/java"/>
<packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
<packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
@@ -589,6 +590,7 @@
<packageset dir="${plugins.dir}/index-more/src/java"/>
<packageset dir="${plugins.dir}/index-replace/src/java"/>
<packageset dir="${plugins.dir}/index-static/src/java"/>
+ <packageset dir="${plugins.dir}/index-links/src/java"/>
<packageset dir="${plugins.dir}/mimetype-filter/src/java"/>
<packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
<packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
@@ -979,6 +981,8 @@
<source path="${plugins.dir}/index-basic/src/java/" />
<source path="${plugins.dir}/index-basic/src/test/" />
<source path="${plugins.dir}/index-geoip/src/java/" />
+ <source path="${plugins.dir}/index-links/src/java/" />
+ <source path="${plugins.dir}/index-links/src/test/" />
<source path="${plugins.dir}/mimetype-filter/src/java/" />
<source path="${plugins.dir}/mimetype-filter/src/test/" />
<source path="${plugins.dir}/indexer-dummy/src/java/" />
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1710033&r1=1710032&r2=1710033&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Thu Oct 22 15:26:34 2015
@@ -1896,4 +1896,37 @@ CAUTION: Set the parser.timeout to -1 or
</description>
</property>
+<!-- index-links plugin -->
+
+<property>
+ <name>index.links.outlinks.host.ignore</name>
+ <value>false</value>
+ <description>
+ Ignore outlinks that point to the same host as the URL being indexed.
+ By default all outlinks are indexed. If db.ignore.internal.links is true (default
+ value), this setting does nothing since the internal links are already
+ ignored.
+ </description>
+</property>
+
+<property>
+ <name>index.links.inlinks.host.ignore</name>
+ <value>false</value>
+ <description>
+ Ignore inlinks coming from the same host as the URL being indexed. By default
+ all inlinks are indexed. If db.ignore.internal.links is true (default
+ value), this setting does nothing since the internal links are already
+ ignored.
+ </description>
+</property>
+
+<property>
+ <name>index.links.hosts.only</name>
+ <value>false</value>
+ <description>
+ Forces the index-links plugin to index only the host portion of the inlinks
+ or outlinks.
+ </description>
+</property>
+
</configuration>
Modified: nutch/trunk/src/plugin/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1710033&r1=1710032&r2=1710033&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Thu Oct 22 15:26:34 2015
@@ -36,6 +36,7 @@
<ant dir="index-replace" target="deploy"/>
<ant dir="index-static" target="deploy"/>
<ant dir="index-metadata" target="deploy"/>
+ <ant dir="index-links" target="deploy"/>
<ant dir="mimetype-filter" target="deploy"/>
<ant dir="indexer-cloudsearch" target="deploy"/>
<ant dir="indexer-dummy" target="deploy"/>
@@ -98,6 +99,7 @@
<ant dir="index-more" target="test"/>
<ant dir="index-static" target="test"/>
<ant dir="index-replace" target="test"/>
+ <ant dir="index-links" target="test"/>
<ant dir="mimetype-filter" target="test"/>
<ant dir="language-identifier" target="test"/>
<ant dir="lib-http" target="test"/>
@@ -143,6 +145,7 @@
<ant dir="index-static" target="clean"/>
<ant dir="index-replace" target="clean"/>
<ant dir="index-metadata" target="clean"/>
+ <ant dir="index-links" target="clean"/>
<ant dir="mimetype-filter" target="clean"/>
<ant dir="indexer-cloudsearch" target="clean"/>
<ant dir="indexer-dummy" target="clean"/>
Added: nutch/trunk/src/plugin/index-links/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/build.xml?rev=1710033&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-links/build.xml (added)
+++ nutch/trunk/src/plugin/index-links/build.xml Thu Oct 22 15:26:34 2015
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-links" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
Added: nutch/trunk/src/plugin/index-links/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/ivy.xml?rev=1710033&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-links/ivy.xml (added)
+++ nutch/trunk/src/plugin/index-links/ivy.xml Thu Oct 22 15:26:34 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
Added: nutch/trunk/src/plugin/index-links/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/plugin.xml?rev=1710033&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-links/plugin.xml (added)
+++ nutch/trunk/src/plugin/index-links/plugin.xml Thu Oct 22 15:26:34 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="index-links"
+ name="Index inlinks and outlinks"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="index-links.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.indexer.links.LinksIndexingFilter"
+ name="Links indexing filter"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="org.apache.nutch.indexer.links.LinksIndexingFilter"
+ class="org.apache.nutch.indexer.links.LinksIndexingFilter"/>
+ </extension>
+
+</plugin>
Added:
nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java?rev=1710033&view=auto
==============================================================================
---
nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
(added)
+++
nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
Thu Oct 22 15:26:34 2015
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.links;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.slf4j.LoggerFactory;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that adds
+ * <code>outlinks</code> and <code>inlinks</code> field(s) to the document.
+ *
+ * In case that you want to ignore the outlinks that point to the same host
+ * as the URL being indexed use the following settings in your configuration
+ * file:
+ *
+ * <property>
+ * <name>index.links.outlinks.host.ignore</name>
+ * <value>true</value>
+ * </property>
+ *
+ * The same configuration is available for inlinks:
+ *
+ * <property>
+ * <name>index.links.inlinks.host.ignore</name>
+ * <value>true</value>
+ * </property>
+ *
+ * To store only the host portion of each inlink URL or outlink URL add the
+ * following to your configuration file.
+ *
+ * <property>
+ * <name>index.links.hosts.only</name>
+ * <value>true</value>
+ * </property>
+ *
+ */
+public class LinksIndexingFilter implements IndexingFilter {
+
+ public final static String LINKS_OUTLINKS_HOST =
"index.links.outlinks.host.ignore";
+ public final static String LINKS_INLINKS_HOST =
"index.links.inlinks.host.ignore";
+ public final static String LINKS_ONLY_HOSTS = "index.links.hosts.only";
+
+ public final static org.slf4j.Logger LOG = LoggerFactory
+ .getLogger(LinksIndexingFilter.class);
+
+ private Configuration conf;
+ private boolean filterOutlinks;
+ private boolean filterInlinks;
+ private boolean indexHost;
+
+ @Override
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+ // Add the outlinks
+ Outlink[] outlinks = parse.getData().getOutlinks();
+
+ if (outlinks != null) {
+ Set<String> hosts = new HashSet<String>();
+
+ for (Outlink outlink : outlinks) {
+ try {
+ String linkUrl = outlink.getToUrl();
+ String outHost = new URL(linkUrl).getHost().toLowerCase();
+
+ if (indexHost) {
+ linkUrl = outHost;
+
+ if (hosts.contains(linkUrl))
+ continue;
+
+ hosts.add(linkUrl);
+ }
+
+ addFilteredLink("outlinks", url.toString(), linkUrl, outHost,
+ filterOutlinks, doc);
+ } catch (MalformedURLException e) {
+ LOG.error("Malformed URL in {}: {}", url, e.getMessage());
+ }
+ }
+ }
+
+ // Add the inlinks
+ if (null != inlinks) {
+ Iterator<Inlink> iterator = inlinks.iterator();
+ Set<String> inlinkHosts = new HashSet<String>();
+
+ while (iterator.hasNext()) {
+ try {
+ Inlink link = iterator.next();
+ String linkUrl = link.getFromUrl();
+ String inHost = new URL(linkUrl).getHost().toLowerCase();
+
+ if (indexHost) {
+ linkUrl = inHost;
+
+ if (inlinkHosts.contains(linkUrl))
+ continue;
+
+ inlinkHosts.add(linkUrl);
+ }
+
+ addFilteredLink("inlinks", url.toString(), linkUrl, inHost,
+ filterInlinks, doc);
+ } catch (MalformedURLException e) {
+ LOG.error("Malformed URL in {}: {}", url, e.getMessage());
+ }
+ }
+ }
+
+ return doc;
+ }
+
+ private void addFilteredLink(String fieldName, String url, String linkUrl,
+ String urlHost, boolean filter, NutchDocument doc) throws
MalformedURLException {
+ if (filter) {
+ String host = new URL(url.toString()).getHost().toLowerCase();
+
+ if (!host.equalsIgnoreCase(urlHost)) {
+ doc.add(fieldName, linkUrl);
+ }
+ } else {
+ doc.add(fieldName, linkUrl);
+ }
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false);
+ filterInlinks = conf.getBoolean(LINKS_INLINKS_HOST, false);
+
+ indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false);
+ }
+
+ public Configuration getConf() {
+ return this.conf;
+ }
+}
Added:
nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java?rev=1710033&view=auto
==============================================================================
---
nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
(added)
+++
nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
Thu Oct 22 15:26:34 2015
@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.links;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.net.URL;
+import java.util.Iterator;
+
+public class TestLinksIndexingFilter {
+
+ Configuration conf = NutchConfiguration.create();
+ LinksIndexingFilter filter = new LinksIndexingFilter();
+ Metadata metadata = new Metadata();
+
+ @Before
+ public void setUp() throws Exception {
+ metadata.add(Response.CONTENT_TYPE, "text/html");
+ }
+
+ private Outlink[] generateOutlinks() throws Exception {
+ return generateOutlinks(false);
+ }
+
+ private Outlink[] generateOutlinks(boolean parts) throws Exception {
+ Outlink[] outlinks = new Outlink[2];
+
+ outlinks[0] = new Outlink("http://www.test.com", "test");
+ outlinks[1] = new Outlink("http://www.example.com", "example");
+
+ if (parts) {
+ outlinks[0] = new Outlink(outlinks[0].getToUrl() + "/index.php?param=1",
+ "test");
+ outlinks[1] = new Outlink(outlinks[1].getToUrl() + "/index.php?param=2",
+ "test");
+ }
+
+ return outlinks;
+ }
+
+ @Test
+ public void testFilterOutlinks() throws Exception {
+ conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+ filter.setConf(conf);
+
+ Outlink[] outlinks = generateOutlinks();
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new
ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+ Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
+
+ Assert.assertEquals("Filter outlinks, allow only those from a different
host",
+ outlinks[0].getToUrl(), doc.getFieldValue("outlinks"));
+ }
+
+ @Test
+ public void testFilterInlinks() throws Exception {
+ conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+ filter.setConf(conf);
+
+ Inlinks inlinks = new Inlinks();
+ inlinks.add(new Inlink("http://www.test.com", "test"));
+ inlinks.add(new Inlink("http://www.example.com", "example"));
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new
ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", new Outlink[0],
metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+ Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
+
+ Assert.assertEquals("Filter inlinks, allow only those from a different
host",
+ "http://www.test.com", doc.getFieldValue("inlinks"));
+ }
+
+ @Test
+ public void testNoFilterOutlinks() throws Exception {
+ filter.setConf(conf);
+
+ Outlink[] outlinks = generateOutlinks();
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new
ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+ Assert.assertEquals("All outlinks must be indexed even those from the same
host",
+ outlinks.length, doc.getField("outlinks").getValues().size());
+ }
+
+ @Test
+ public void testNoFilterInlinks() throws Exception {
+ conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
+ filter.setConf(conf);
+
+ Inlinks inlinks = new Inlinks();
+ inlinks.add(new Inlink("http://www.test.com", "test"));
+ inlinks.add(new Inlink("http://www.example.com", "example"));
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new
ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", new Outlink[0],
metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+ Assert.assertEquals("All inlinks must be indexed even those from the same
host",
+ inlinks.size(), doc.getField("inlinks").getValues().size());
+ }
+
+ @Test
+ public void testIndexOnlyHostPart() throws Exception {
+ conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+ conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+ conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+ filter.setConf(conf);
+
+ Outlink[] outlinks = generateOutlinks(true);
+
+ Inlinks inlinks = new Inlinks();
+ inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
+ inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
+ inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example",
+ "example"));
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new
ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+ NutchField docOutlinks = doc.getField("outlinks");
+
+ Assert.assertEquals("Only the host portion of the outlink URL must be
indexed",
+ new URL("http://www.test.com").getHost(),
+ docOutlinks.getValues().get(0));
+
+ Assert.assertEquals(
+ "The inlinks coming from the same host must count only once", 1,
+ doc.getField("inlinks").getValues().size());
+
+ Assert.assertEquals("Only the host portion of the inlinks URL must be
indexed",
+ new URL("http://www.test.com").getHost(),
doc.getFieldValue("inlinks"));
+ }
+
+ @Test
+ public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
+ conf = NutchConfiguration.create();
+ conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+ conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+
+ Outlink[] outlinks = generateOutlinks(true);
+
+ filter.setConf(conf);
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new
ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+ Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
+
+ Assert.assertEquals(
+ "Index only the host portion of the outlinks after filtering",
+ new URL("http://www.test.com").getHost(),
+ doc.getFieldValue("outlinks"));
+ }
+
+ @Test
+ public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
+ conf = NutchConfiguration.create();
+ conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+ conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+
+ filter.setConf(conf);
+
+ Inlinks inlinks = new Inlinks();
+ inlinks.add(new Inlink("http://www.test.com", "test"));
+ inlinks.add(new Inlink("http://www.example.com", "example"));
+
+ NutchDocument doc = filter.filter(new NutchDocument(), new
ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", new Outlink[0],
metadata)),
+ new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+ Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
+
+ Assert.assertEquals(
+ "Index only the host portion of the inlinks after filtering",
+ new URL("http://www.test.com").getHost(),
+ doc.getFieldValue("inlinks"));
+
+ }
+}
\ No newline at end of file