[
https://issues.apache.org/jira/browse/NUTCH-2139?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14959460#comment-14959460
]
ASF GitHub Bot commented on NUTCH-2139:
---------------------------------------
Github user jorgelbg commented on a diff in the pull request:
https://github.com/apache/nutch/pull/78#discussion_r42168120
--- Diff:
src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
---
@@ -0,0 +1,168 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.links;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.slf4j.LoggerFactory;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that adds
+ * <code>outlinks</code> and <code>inlinks</code> field(s) to the document.
+ *
+ * In case that you want to ignore the outlinks that point to the same host
+ * as the URL being indexed use the following settings in your configuration
+ * file:
+ *
+ * <property>
+ * <name>outlinks.host.ignore</name>
+ * <value>true</value>
+ * </property>
+ *
+ * The same configuration is available for inlinks:
+ *
+ * <property>
+ * <name>inlinks.host.ignore</name>
+ * <value>true</value>
+ * </property>
+ *
+ * To store only the host portion of each inlink URL or outlink URL add the
+ * following to your configuration file.
+ *
+ * <property>
+ * <name>links.hosts.only</name>
+ * <value>false</value>
+ * </property>
+ *
+ */
+public class LinksIndexingFilter implements IndexingFilter {
+
+ public final static String LINKS_OUTLINKS_HOST = "outlinks.host.ignore";
+ public final static String LINKS_INLINKS_HOST = "inlinks.host.ignore";
+ public final static String LINKS_ONLY_HOSTS = "links.hosts.only";
+
+ public final static org.slf4j.Logger LOG = LoggerFactory
+ .getLogger(LinksIndexingFilter.class);
+
+ private Configuration conf;
+ private boolean filterOutlinks;
+ private boolean filterInlinks;
+ private boolean indexHost;
+
+ @Override
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+ // Add the outlinks
+ Outlink[] outlinks = parse.getData().getOutlinks();
+
+ try {
+ if (outlinks != null) {
+ Set<String> hosts = new HashSet<String>();
+
+ for (Outlink outlink : outlinks) {
+ String linkUrl = outlink.getToUrl();
+ String outHost = new URL(linkUrl).getHost();
+
+ if (indexHost) {
+ linkUrl = new URL(outlink.getToUrl()).getHost();
+
+ if (hosts.contains(linkUrl))
+ continue;
+
+ hosts.add(linkUrl);
+ }
+
+ addFilteredLink("outlinks", url.toString(), linkUrl, outHost,
+ filterOutlinks, doc);
+ }
+ }
+ } catch (MalformedURLException e) {
+ LOG.error("Malformed URL in {}: {}", url, e.getMessage());
+ }
+
+ // Add the inlinks
+ if (null != inlinks) {
+ Iterator<Inlink> iterator = inlinks.iterator();
+ Set<String> inlinkHosts = new HashSet<String>();
+
+ try {
+ while (iterator.hasNext()) {
--- End diff --
I thought the same: since the URL has already been fetched, there shouldn't be
any trouble. But it's an easy fix, so I can put it inside the while loop.
> Basic plugin to index inlinks and outlinks
> ------------------------------------------
>
> Key: NUTCH-2139
> URL: https://issues.apache.org/jira/browse/NUTCH-2139
> Project: Nutch
> Issue Type: Improvement
> Components: indexer, plugin
> Reporter: Jorge Luis Betancourt Gonzalez
> Priority: Minor
> Labels: link, plugin
> Fix For: 1.11
>
>
> Basic plugin that allows indexing the inlinks and outlinks of web pages.
> This could be very useful for analytics purposes, including neat
> visualizations using d3.js, for instance.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)