Hmm, I thought I committed one a long time ago, but it seems I never did. Attached is 
support for rel-canonical as a parse filter for 1.x (it should be easy to port to 
2.x). I'm not up to providing a proper patch at the moment.
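
To hook it up, register it like any other HtmlParseFilter plugin and add its id to plugin.includes in nutch-site.xml. A plugin.xml along these lines should do; it is untested as written here, and the plugin id, version, provider and jar name are just placeholders to adjust:

<?xml version="1.0" encoding="UTF-8"?>
<plugin id="parse-relcanonical" name="Rel Canonical Parse Filter"
        version="1.0.0" provider-name="openindex.io">
  <runtime>
    <library name="parse-relcanonical.jar">
      <export name="*"/>
    </library>
  </runtime>
  <requires>
    <import plugin="nutch-extensionpoints"/>
  </requires>
  <extension id="io.openindex.nutch.parse.relcanonical"
             name="Rel Canonical Parse Filter"
             point="org.apache.nutch.parse.HtmlParseFilter">
    <implementation id="RelCanonicalParseFilter"
                    class="io.openindex.nutch.parse.relcanonical.RelCanonicalParseFilter"/>
  </extension>
</plugin>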

We have used this one for a very, very long time. Comments are inline; the odd swear 
word is probably still in there, as I never quite made it fit for ASF inclusion in 
all that time.

Regards,
markus

-----Original message-----
> From: Ben Vachon <[email protected]>
> Sent: Thursday 18th May 2017 16:12
> To: [email protected]
> Subject: rel="canonical" attribute
> 
> Hi all,
> 
> I'm wondering how Nutch 2.3.1 handles links with the rel="canonical" 
> attribute.
> 
> I found this ticket: https://issues.apache.org/jira/browse/NUTCH-710 
> which is from version 1.1 and doesn't seem to have ever been resolved. 
> Are all canonical links still just rejected? Are there any plans to add 
> any of the other options Sebastian Nagel mentions in the last comment?
> 
> Thanks,
> 
> Ben V.
> 
> (p.s. is there a date set for the Nutch 2.4 release?)
> 
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.openindex.nutch.parse.relcanonical;

import java.net.URL;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;

import org.apache.hadoop.conf.Configuration;

import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NodeWalker;
import org.apache.nutch.util.URLUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.w3c.dom.Attr;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

/**
 * Handles the rel=canonical link element and HTTP Link header. If a page
 * points to a different canonical URL, the filter injects robots=noindex
 * into the parse metadata (relying on the indexer to honour that flag) and
 * adds the canonical URL as an outlink.
 *
 * @author [email protected]
 */
public class RelCanonicalParseFilter implements HtmlParseFilter {

  private Configuration conf;
  private URLNormalizers urlNormalizers;
  private URLFilters urlFilters;
  public static final Logger LOG = LoggerFactory.getLogger(RelCanonicalParseFilter.class);

  /**
   * Default constructor.
   */
  public RelCanonicalParseFilter() { }

  /**
   * Looks for a canonical URL in the link element, the HTTP Link header and
   * the og:url parse metadata, in that order. If one is found that differs
   * from the current URL, robots=noindex is set and the canonical URL is
   * added as an outlink.
   *
   * @param content the fetched content
   * @param parseResult the parse result to amend
   * @param metaTags the HTML meta tags
   * @param doc the DOM fragment of the parsed document
   * @return the (possibly modified) parse result
   */
  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    // The base URL
    URL baseUrl = null;
      
    // Get the base URL
    try {
      baseUrl = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      // Leave baseUrl null; absolute canonical URLs can still be resolved below
    }

    // Contains the canonical if we find any
    String canonical = null;

    // Get the URL
    String url = content.getUrl();

    // Get the parse!
    Parse parse = parseResult.get(url);

    // Ok
    if (parse != null) {
      // Get the URL from the meta link
      canonical = getMetaLink(doc);

      // Got any?
      if (canonical != null) {
        // Resolve, check and normalize and filter
        canonical = resolveCanonicalUrl(baseUrl, url, canonical);
      }

      // Read from HTTP header if we still haven't got any
      if (canonical == null) {
        // Is there a link header in the HTTP header?
        canonical = getHttpHeaderLink(parse);

        // Did we get any?
        if (canonical != null) {
          // Resolve, check and normalize and filter
          canonical = resolveCanonicalUrl(baseUrl, url, canonical);
        }

        // Read from og:url if we still haven't got one
        if (canonical == null) {
          canonical = parse.getData().getParseMeta().get("og:url");

          // Did we get any?
          if (canonical != null) {
            canonical = resolveCanonicalUrl(baseUrl, url, canonical);
          }
        }
      }

      // Did we find a canonical URL?
      if (canonical != null) {        
        // Check for existing robots meta
        String robotsMeta = parse.getData().getParseMeta().get("robots");
        
        // Lowercase and trim for comparison
        if (robotsMeta != null) {
          robotsMeta = robotsMeta.toLowerCase().trim();
        }
        
        // Get parts of the robots meta
        List<String> parts = new ArrayList<String>();
        
        if (robotsMeta != null) {
          Collections.addAll(parts, robotsMeta.split(","));
        }
        
        // Get rid of white space
        for (int i = 0; i < parts.size(); i++) {
          parts.set(i, parts.get(i).trim());
        }
        
        // Get rid of empty stuff
        parts.remove("");
        
        // Remove any existing index directive
        parts.remove("index");
        
        // Add noindex only if it isn't already there
        if (!parts.contains("noindex")) {
          parts.add("noindex");
        }        
        
        // Have the indexer remove this URL from the index
        parse.getData().getParseMeta().set("robots", join(parts, ","));

        // We'll get our outlinks here
        List<Outlink> outlinks = new ArrayList<Outlink>();

        // Get the outlinks
        Collections.addAll(outlinks, parse.getData().getOutlinks());

        try {
          // Add the canonical URL to the outlinks
          outlinks.add(new Outlink(canonical, ""));

          // Set the outlinks
          parse.getData().setOutlinks(outlinks.toArray(new Outlink[outlinks.size()]));
        } catch (MalformedURLException e) {
          // Should not happen: the canonical has already been normalized and filtered
        }
      }
    }

    // Return the whole
    return parseResult;
  }

  /**
   * Resolves, normalizes and filters the canonical URL and checks that it
   * differs from the current URL.
   *
   * @param baseUrl the base URL to resolve against
   * @param url the current URL
   * @param canonicalUrl the canonical URL candidate
   * @return the resolved canonical URL, or null if it is rejected or equal
   *         to the current URL
   */
  private String resolveCanonicalUrl(URL baseUrl, String url, String canonicalUrl) {
    // Resolve the URL
    try {
      canonicalUrl = URLUtil.resolveURL(baseUrl, canonicalUrl).toString();

      // Do we have a canonical URL
      if (canonicalUrl != null) {
        // Pass the URL through the URL normalizers
        canonicalUrl = normalizeUrl(canonicalUrl);

        // Still got one after normalizing?
        if (canonicalUrl != null) {
          // ..then pass it through the URL filters
          canonicalUrl = filterUrl(canonicalUrl);

          // Do we still have a canonical URL and is it different from the current one?
          if (canonicalUrl != null && !canonicalUrl.equals(url)) {
            return canonicalUrl;
          }
        }
      }
    } catch (MalformedURLException e) {
      // Could not resolve against the base URL; return null below
    }

    // Got nothing
    return null;
  }

  /**
   * Attempts to read a canonical URL from the HTTP Link header.
   *
   * @param parse the parse holding the content metadata
   * @return the canonical URL, or null if no canonical Link header is present
   */
  private String getHttpHeaderLink(Parse parse) {
    // Read the link header
    String link = parse.getData().getContentMeta().get("Link");

    // Get any?
    if (link != null) {
      // Get the parts <http://www.example.org/>; rel="canonical"
      String[] parts = link.split(" ");

      // Got two parts?
      if (parts.length == 2) {
        // Is rel canonical?
        if (parts[1].equalsIgnoreCase("rel=\"canonical\"")) {
          return parts[0].substring(1, parts[0].length() - 2);
        }
      }
    }

    // Got nothing..
    return null;
  }

  /**
   * Finds a link element with rel="canonical" and returns its href value.
   *
   * @param doc the DOM fragment to search
   * @return the canonical URL, or null if no such element is found
   */
  private String getMetaLink(DocumentFragment doc) {
    // We're looking for link elements
    String element = "link";
    NodeWalker walker = new NodeWalker(doc);

    try {
      while (walker.hasNext()) {
        Node currentNode = walker.nextNode();

        if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
          if (element.equalsIgnoreCase(currentNode.getNodeName())) {
            HashMap<String,String> atts = getAttributes(currentNode);

            // Must have href and rel
            if (atts.containsKey("href") && atts.containsKey("rel")) {
              // rel must be canonical
              if (atts.get("rel").equalsIgnoreCase("canonical")) {
                // Return the value for href
                return atts.get("href");
              }
            }
          }
        }
      }
    } catch (Exception e) {
      // Ignore DOM traversal errors and fall through to return null
    }

    // Seems nothing is found
    return null;
  }

  /**
   * Returns a key/value map with the attributes of the given node.
   *
   * @param node the node to read the attributes from
   * @return a map of lowercased attribute names to their values
   */
  private HashMap<String,String> getAttributes(Node node) {
    HashMap<String,String> attribMap = new HashMap<String,String>();

    NamedNodeMap attributes = node.getAttributes();

    for(int i = 0 ; i < attributes.getLength(); i++) {
      Attr attribute = (Attr)attributes.item(i);
      attribMap.put(attribute.getName().toLowerCase(), attribute.getValue());
    }

    return attribMap;
  }

  /**
   * Normalizes and trims extra whitespace from the given URL.
   *
   * @param url the URL to normalize
   * @return the normalized URL, or null if normalization fails
   */
  private String normalizeUrl(String url) {
    String normalized = null;
    if (urlNormalizers != null) {
      try {

        // normalize and trim the url
        normalized = urlNormalizers.normalize(url,
          URLNormalizers.SCOPE_DEFAULT);
        normalized = normalized.trim();
      }
      catch (Exception e) {
        LOG.warn("Skipping " + url + ":" + e);
        normalized = null;
      }
    }
    return normalized;
  }

  /**
   * Filters the given url.
   *
   * @param url The url to filter.
   *
   * @return The filtered url or null.
   */
  private String filterUrl(String url) {
    try {
      url = urlFilters.filter(url);
    } catch (Exception e) {
      // Treat the URL as rejected if the filters throw
      url = null;
    }

    return url;
  }

  /**
   * Sets the configuration and initializes the URL normalizers and filters.
   */
  public void setConf(Configuration conf) {
    this.conf = conf;

    urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
    urlFilters = new URLFilters(conf);
  }

  /**
   * Returns the configuration.
   */
  public Configuration getConf() {
    return this.conf;
  }
  
  /**
   * Joins the parts with the given glue string.
   *
   * @param parts the strings to join
   * @param glue the separator to put between them
   * @return the joined string
   */
  public static String join(List<String> parts, String glue) {
    StringBuilder sb = new StringBuilder();

    boolean first = true;

    for (String part : parts) {
      if (first) {
        first = false;
      } else {
        sb.append(glue);
      }

      sb.append(part);
    }
    return sb.toString();
  }
}
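
For reference, here is a rough, untested sketch of how the filter can be exercised outside a full crawl, for instance from a quick test class run inside a Nutch 1.x runtime (so the plugin directory with the URL normalizers and filters is on the classpath). The class name, URLs and HTML below are made up for the illustration; only the Nutch and DOM calls are real:

import java.io.ByteArrayInputStream;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;

import io.openindex.nutch.parse.relcanonical.RelCanonicalParseFilter;

public class RelCanonicalExample {

  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();

    // A URL whose page declares a different canonical URL
    String url = "http://www.example.org/page?sessionid=123";
    String html = "<html><head>"
        + "<link rel=\"canonical\" href=\"http://www.example.org/page\"/>"
        + "</head><body>hello</body></html>";

    // Build a DOM fragment, roughly the way parse-html hands one to its filters
    DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    Document dom = builder.parse(new ByteArrayInputStream(html.getBytes("UTF-8")));
    DocumentFragment frag = dom.createDocumentFragment();
    frag.appendChild(dom.getDocumentElement());

    // Minimal Content and ParseResult for the same URL
    Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html",
        new Metadata(), conf);
    ParseData parseData = new ParseData(new ParseStatus(ParseStatus.SUCCESS), "title",
        new Outlink[0], new Metadata(), new Metadata());
    ParseResult parseResult = ParseResult.createParseResult(url,
        new ParseImpl("hello", parseData));

    // Run the filter
    RelCanonicalParseFilter filter = new RelCanonicalParseFilter();
    filter.setConf(conf);
    parseResult = filter.filter(content, parseResult, new HTMLMetaTags(), frag);

    // Expect robots=noindex and the canonical URL added as an extra outlink
    ParseData data = parseResult.get(url).getData();
    System.out.println("robots   : " + data.getParseMeta().get("robots"));
    System.out.println("outlinks : " + data.getOutlinks().length);
  }
}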
