Hmm, I thought I committed one a long time ago; it seems I never did. Attached is support for rel-canonical as a parse filter for 1.x (it should be easily ported to 2.x). I'm not up to providing a proper patch at this moment.
We have used this one for a very, very long time. Comments are inline; swear words are probably included, as I have not made it fit for ASF inclusion in all that time. Regards, markus -----Original message----- > From:Ben Vachon <[email protected]> > Sent: Thursday 18th May 2017 16:12 > To: [email protected] > Subject: rel="canonical" attribute > > Hi all, > > I'm wondering how Nutch 2.3.1 handles links with the rel="canonical" > attribute. > > I found this ticket: https://issues.apache.org/jira/browse/NUTCH-710 > which is from version 1.1 and doesn't seem to have ever been resolved. > Are all canonical links still just rejected? Are there any plans to add > any of the other options Sebastian Nagel mentions in the last comment? > > Thanks, > > Ben V. > > (p.s. is there a date set for the Nutch 2.4 release?) >
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.openindex.nutch.parse.relcanonical;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NodeWalker;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Attr;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

/**
 * Handles the rel=canonical {@code <link>} element, the HTTP {@code Link}
 * response header and the {@code og:url} parse metadata, in that order of
 * precedence. When a canonical URL different from the current URL is found,
 * this filter injects a {@code noindex} directive into the {@code robots}
 * parse metadata (so the indexer can drop this duplicate document) and
 * appends the canonical URL to the outlinks so it will be discovered and
 * fetched.
 *
 * @author markus@openindex.io
 */
public class RelCanonicalParseFilter implements HtmlParseFilter {

  public static final Logger LOG =
      LoggerFactory.getLogger(RelCanonicalParseFilter.class);

  private Configuration conf;
  private URLNormalizers urlNormalizers;
  private URLFilters urlFilters;

  /**
   * Default constructor.
   */
  public RelCanonicalParseFilter() {
  }

  /**
   * Extracts a canonical URL from, in order of precedence, the document's
   * {@code <link rel="canonical">} element, the HTTP {@code Link} response
   * header and the {@code og:url} parse metadata. If a canonical URL
   * different from the current URL survives resolving, normalization and
   * filtering, {@code noindex} is added to the {@code robots} parse metadata
   * and the canonical URL is appended to the outlinks.
   *
   * @param content the fetched content
   * @param parseResult the parse result to inspect and (possibly) modify
   * @param metaTags the parsed HTML meta tags (unused here)
   * @param doc the DOM fragment of the parsed document
   * @return the (possibly modified) parse result
   */
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    // The base URL is needed to resolve relative canonical URLs.
    URL baseUrl = null;
    try {
      baseUrl = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      // Only prevents resolving relative canonicals; keep going.
      LOG.warn("Malformed base URL " + content.getBaseUrl() + ": " + e);
    }

    String url = content.getUrl();
    Parse parse = parseResult.get(url);
    if (parse == null) {
      return parseResult;
    }

    // 1. <link rel="canonical" href="..."/> in the document head
    String canonical = getMetaLink(doc);
    if (canonical != null) {
      canonical = resolveCanonicalUrl(baseUrl, url, canonical);
    }

    // 2. HTTP Link response header, if the document gave us nothing
    if (canonical == null) {
      canonical = getHttpHeaderLink(parse);
      if (canonical != null) {
        canonical = resolveCanonicalUrl(baseUrl, url, canonical);
      }

      // 3. og:url parse metadata as a last resort
      if (canonical == null) {
        canonical = parse.getData().getParseMeta().get("og:url");
        if (canonical != null) {
          canonical = resolveCanonicalUrl(baseUrl, url, canonical);
        }
      }
    }

    // A surviving canonical URL means this document is a duplicate: tell
    // the indexer to skip it and make sure the canonical URL gets fetched.
    if (canonical != null) {
      markNoIndex(parse);
      addCanonicalOutlink(parse, canonical);
    }

    return parseResult;
  }

  /**
   * Adds {@code noindex} to the {@code robots} parse metadata so the
   * indexer will drop this document in favour of its canonical URL. Any
   * existing directives are kept, except {@code index}, which is removed.
   *
   * @param parse the parse whose metadata is updated
   */
  private void markNoIndex(Parse parse) {
    String robotsMeta = parse.getData().getParseMeta().get("robots");

    List<String> rawParts = new ArrayList<String>();
    if (robotsMeta != null) {
      Collections.addAll(rawParts,
          robotsMeta.toLowerCase(Locale.ROOT).trim().split(","));
    }

    // Trim each directive and drop empty entries and every "index"
    // directive. (A plain List.remove() would only drop the first
    // occurrence.)
    List<String> directives = new ArrayList<String>();
    for (String rawPart : rawParts) {
      String directive = rawPart.trim();
      if (directive.length() > 0 && !directive.equals("index")) {
        directives.add(directive);
      }
    }

    // Add noindex only if it isn't there already
    if (!directives.contains("noindex")) {
      directives.add("noindex");
    }

    // Have the indexer remove this URL from the index
    parse.getData().getParseMeta().set("robots", join(directives, ","));
  }

  /**
   * Appends the canonical URL to the document's outlinks so it will be
   * discovered and fetched.
   *
   * @param parse the parse whose outlinks are extended
   * @param canonical the canonical URL to add
   */
  private void addCanonicalOutlink(Parse parse, String canonical) {
    List<Outlink> outlinks = new ArrayList<Outlink>();
    Collections.addAll(outlinks, parse.getData().getOutlinks());
    try {
      outlinks.add(new Outlink(canonical, ""));
      parse.getData().setOutlinks(
          outlinks.toArray(new Outlink[outlinks.size()]));
    } catch (MalformedURLException e) {
      // The canonical URL was already normalized and filtered, so this
      // should not happen; keep the original outlinks if it does.
      LOG.warn("Malformed canonical URL " + canonical + ": " + e);
    }
  }

  /**
   * Resolves the canonical URL against the base URL, then normalizes and
   * filters it, and checks that it differs from the current URL.
   *
   * @param baseUrl the base URL of the document, may be null
   * @param url the current URL of the document
   * @param canonicalUrl the raw canonical URL candidate
   * @return the resolved, normalized and filtered canonical URL, or null if
   *         it was rejected or equals the current URL
   */
  private String resolveCanonicalUrl(URL baseUrl, String url,
      String canonicalUrl) {
    try {
      canonicalUrl = URLUtil.resolveURL(baseUrl, canonicalUrl).toString();
      // Pass the URL through the URL normalizers
      canonicalUrl = normalizeUrl(canonicalUrl);
      if (canonicalUrl != null) {
        // ..and the URL filters
        canonicalUrl = filterUrl(canonicalUrl);
        // A canonical URL equal to the current URL is not a duplicate
        if (canonicalUrl != null && !canonicalUrl.equals(url)) {
          return canonicalUrl;
        }
      }
    } catch (MalformedURLException e) {
      LOG.debug("Failed to resolve canonical URL " + canonicalUrl + ": " + e);
    }
    // Got nothing
    return null;
  }

  /**
   * Attempts to read a canonical URL from the HTTP {@code Link} response
   * header, e.g. {@code <http://www.example.org/>; rel="canonical"}.
   *
   * @param parse the parse holding the content metadata
   * @return the canonical URL, or null if the header is absent or does not
   *         declare a canonical relation
   */
  private String getHttpHeaderLink(Parse parse) {
    String link = parse.getData().getContentMeta().get("Link");
    if (link == null) {
      return null;
    }
    // Expected form: <http://www.example.org/>; rel="canonical"
    String[] parts = link.split(" ");
    // Length guard: substring(1, length - 2) below needs at least "<x>;"
    if (parts.length == 2 && parts[0].length() > 3
        && parts[1].equalsIgnoreCase("rel=\"canonical\"")) {
      // Strip the leading '<' and the trailing '>;'
      return parts[0].substring(1, parts[0].length() - 2);
    }
    // Got nothing..
    return null;
  }

  /**
   * Walks the DOM and returns the href of the first
   * {@code <link rel="canonical">} element found.
   *
   * @param doc the parsed document fragment
   * @return the canonical href, or null if none is present
   */
  private String getMetaLink(DocumentFragment doc) {
    NodeWalker walker = new NodeWalker(doc);
    try {
      while (walker.hasNext()) {
        Node currentNode = walker.nextNode();
        if (currentNode.getNodeType() != Node.ELEMENT_NODE
            || !"link".equalsIgnoreCase(currentNode.getNodeName())) {
          continue;
        }
        HashMap<String, String> atts = getAttributes(currentNode);
        // Must have rel="canonical" and an href
        String rel = atts.get("rel");
        if (rel != null && rel.equalsIgnoreCase("canonical")
            && atts.containsKey("href")) {
          return atts.get("href");
        }
      }
    } catch (Exception e) {
      // Be defensive: a broken DOM must not fail the whole parse
      LOG.debug("Error while looking for rel=canonical link: " + e);
    }
    // Seems nothing is found
    return null;
  }

  /**
   * Returns a key/value map with the attributes of the given node, with
   * attribute names lower-cased.
   *
   * @param node the node to read attributes from
   * @return a map of attribute name to attribute value
   */
  private HashMap<String, String> getAttributes(Node node) {
    HashMap<String, String> attribMap = new HashMap<String, String>();
    NamedNodeMap attributes = node.getAttributes();
    for (int i = 0; i < attributes.getLength(); i++) {
      Attr attribute = (Attr) attributes.item(i);
      attribMap.put(attribute.getName().toLowerCase(Locale.ROOT),
          attribute.getValue());
    }
    return attribMap;
  }

  /**
   * Normalizes and trims extra whitespace from the given url.
   *
   * @param url The url to normalize.
   * @return The normalized url, or null if normalization failed.
   */
  private String normalizeUrl(String url) {
    if (urlNormalizers == null) {
      return null;
    }
    try {
      // normalize and trim the url
      return urlNormalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT)
          .trim();
    } catch (Exception e) {
      LOG.warn("Skipping " + url + ":" + e);
      return null;
    }
  }

  /**
   * Filters the given url through the configured URL filters.
   *
   * @param url The url to filter.
   * @return The filtered url or null.
   */
  private String filterUrl(String url) {
    try {
      return urlFilters.filter(url);
    } catch (Exception e) {
      LOG.debug("Error filtering " + url + ": " + e);
      return null;
    }
  }

  /**
   * Sets the configuration and (re)creates the URL normalizers and filters.
   *
   * @param conf the Hadoop configuration
   */
  public void setConf(Configuration conf) {
    this.conf = conf;
    urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
    urlFilters = new URLFilters(conf);
  }

  /**
   * Returns the configuration.
   *
   * @return the Hadoop configuration set via {@link #setConf}
   */
  public Configuration getConf() {
    return this.conf;
  }

  /**
   * Joins the given parts with the given glue string.
   *
   * @param parts the strings to join
   * @param glue the separator placed between consecutive parts
   * @return the joined string
   */
  public static String join(List<String> parts, String glue) {
    StringBuilder sb = new StringBuilder();
    boolean first = true;
    for (String part : parts) {
      if (!first) {
        sb.append(glue);
      }
      first = false;
      sb.append(part);
    }
    return sb.toString();
  }
}

