Because I needed to add two more fields from the meta tags in the html
page I have revised some of the code in HTMLMetaProcessor and in
DOMContentUtils.
I believe it to be a little more generic than the existing code (see
DOMContentUtils.GetMetaAttributes); it follows the sample from Jérôme below,
since the existing code can handle only http-equiv or name.
Since I am not too familiar with svn, I have pasted the code at the end of
this email; it might be useful to someone.
On Tue, 2006-01-10 at 08:48 -0800, Doug Cutting wrote:
> Jérôme Charron wrote:
> > For consistency and to decouple a little Nutch HTML Parser and Xerces
> > implementation, I suggest to change these lines by something like:
> > Node nameNode = null;
> > Node equivNode = null;
> > Node contentNode = null;
> > for (int i=0; i<attrs.getLength(); i++) {
> > Node attr = attrs.item(i);
> > String attrName = attr.getNodeName().toLowerCase();
> > if (attrName.equals("name")) {
> > nameNode = attr;
> > } else if (attrName.equals("http-equiv")) {
> > equivNode = attr;
> > } else if (attrName.equals("content")) {
> > contentNode = attr;
> > }
> > }
>
> +1
>
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.html;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import org.apache.nutch.parse.Outlink;
import org.w3c.dom.*;
/**
* A collection of methods for extracting content from DOM trees.
* <p/>
* This class holds a few utility methods for pulling content out of
* DOM nodes, such as getOutlinks, getText, etc.
*/
public class DOMContentUtils {
public static class LinkParams {
public String elName;
public String attrName;
public int childLen;
public LinkParams(String elName, String attrName, int childLen) {
this.elName = elName;
this.attrName = attrName;
this.childLen = childLen;
}
public String toString() {
return "LP[el=" + elName + ",attr=" + attrName + ",len=" +
childLen + "]";
}
}
public static HashMap linkParams = new HashMap();
static {
linkParams.put("a", new LinkParams("a", "href", 1));
linkParams.put("area", new LinkParams("area", "href", 0));
linkParams.put("form", new LinkParams("form", "action", 1));
linkParams.put("frame", new LinkParams("frame", "src", 0));
linkParams.put("iframe", new LinkParams("iframe", "src", 0));
linkParams.put("script", new LinkParams("script", "src", 0));
linkParams.put("link", new LinkParams("link", "href", 0));
linkParams.put("img", new LinkParams("img", "src", 0));
}
/**
* This method takes a [EMAIL PROTECTED] StringBuffer} and a DOM [EMAIL
PROTECTED] Node},
* and will append all the content text found beneath the DOM node to
* the <code>StringBuffer</code>.
* <p/>
* <p/>
* <p/>
* If <code>abortOnNestedAnchors</code> is true, DOM traversal will
* be aborted and the <code>StringBuffer</code> will not contain
* any text encountered after a nested anchor is found.
* <p/>
* <p/>
*
* @return true if nested anchors were found
*/
public static final boolean getText(StringBuffer sb, Node node,
boolean abortOnNestedAnchors) {
if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
return true;
}
return false;
}
/**
* This is a convinience method, equivalent to [EMAIL PROTECTED]
* #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
*/
public static final void getText(StringBuffer sb, Node node) {
getText(sb, node, false);
}
// returns true if abortOnNestedAnchors is true and we find nested
// anchors
private static final boolean getTextHelper(StringBuffer sb, Node node,
boolean
abortOnNestedAnchors,
int anchorDepth) {
if ("script".equalsIgnoreCase(node.getNodeName())) {
return false;
}
if ("style".equalsIgnoreCase(node.getNodeName())) {
return false;
}
if (abortOnNestedAnchors &&
"a".equalsIgnoreCase(node.getNodeName())) {
anchorDepth++;
if (anchorDepth > 1)
return true;
}
if (node.getNodeType() == Node.COMMENT_NODE) {
return false;
}
if (node.getNodeType() == Node.TEXT_NODE) {
// cleanup and trim the value
String text = node.getNodeValue();
text = text.replaceAll("\\s+", " ");
text = text.trim();
if (text.length() > 0) {
if (sb.length() > 0) sb.append(' ');
sb.append(text);
}
}
boolean abort = false;
NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
for (int i = 0; i < len; i++) {
if (getTextHelper(sb, children.item(i),
abortOnNestedAnchors, anchorDepth)) {
abort = true;
break;
}
}
}
return abort;
}
/**
* This method takes a [EMAIL PROTECTED] StringBuffer} and a DOM [EMAIL
PROTECTED] Node},
* and will append the content text found beneath the first
* <code>title</code> node to the <code>StringBuffer</code>.
*
* @return true if a title node was found, false otherwise
*/
public static final boolean getTitle(StringBuffer sb, Node node) {
if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD
return false;
if (node.getNodeType() == Node.ELEMENT_NODE) {
if ("title".equalsIgnoreCase(node.getNodeName())) {
getText(sb, node);
return true;
}
}
NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
for (int i = 0; i < len; i++) {
if (getTitle(sb, children.item(i))) {
return true;
}
}
}
return false;
}
public static final String GetMetaAttributes(Node node, String
nodeName, String nodeValue) {
String ret = null;
if ("body".equalsIgnoreCase(node.getNodeName()))
return ret;
if (node.getNodeType() == Node.ELEMENT_NODE) {
if ("meta".equalsIgnoreCase(node.getNodeName())) {
if (!node.hasAttributes())
return ret;
NamedNodeMap attr = node.getAttributes();
if (attr.getLength() != 2)
return ret;
Node n1 = attr.item(0);
Node n2 = attr.item(1);
if (nodeName.equalsIgnoreCase(n1.getNodeName()))
{
if (!nodeValue.equalsIgnoreCase(n1.getNodeValue()))
return ret;
if (!"content".equalsIgnoreCase(n2.getNodeName()))
return ret;
ret = n2.getNodeValue().toLowerCase();
return ret;
}
if (nodeName.equalsIgnoreCase(n2.getNodeName()))
{
if (!nodeValue.equalsIgnoreCase(n2.getNodeValue()))
return ret;
if (!"content".equalsIgnoreCase(n1.getNodeName()))
return ret;
ret = n1.getNodeValue().toLowerCase();
return ret;
}
return ret;
}
}
NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
for (int i = 0; i < len; i++) {
if ((ret = GetMetaAttributes(children.item(i), nodeName,
nodeValue)) != null) {
return ret;
}
}
}
return ret;
}
/**
* If Node contains a BASE tag then it's HREF is returned.
*/
public static final URL getBase(Node node) {
// is this node a BASE tag?
if (node.getNodeType() == Node.ELEMENT_NODE) {
if ("body".equalsIgnoreCase(node.getNodeName())) // stop after
HEAD
return null;
if ("base".equalsIgnoreCase(node.getNodeName())) {
NamedNodeMap attrs = node.getAttributes();
for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
if ("href".equalsIgnoreCase(attr.getNodeName())) {
try {
return new URL(attr.getNodeValue());
} catch (MalformedURLException e) {
}
}
}
}
}
// does it contain a base tag?
NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
for (int i = 0; i < len; i++) {
URL base = getBase(children.item(i));
if (base != null)
return base;
}
}
// no.
return null;
}
private static boolean hasOnlyWhiteSpace(Node node) {
String val = node.getNodeValue();
for (int i = 0; i < val.length(); i++) {
if (!Character.isWhitespace(val.charAt(i)))
return false;
}
return true;
}
// this only covers a few cases of empty links that are symptomatic
// of nekohtml's DOM-fixup process...
private static boolean shouldThrowAwayLink(Node node, NodeList
children,
int childLen, LinkParams
params) {
if (childLen == 0) {
// this has no inner structure
if (params.childLen == 0) return false;
else return true;
} else if ((childLen == 1)
&& (children.item(0).getNodeType() == Node.ELEMENT_NODE)
&&
(params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
// single nested link
return true;
} else if (childLen == 2) {
Node c0 = children.item(0);
Node c1 = children.item(1);
if ((c0.getNodeType() == Node.ELEMENT_NODE)
&& (params.elName.equalsIgnoreCase(c0.getNodeName()))
&& (c1.getNodeType() == Node.TEXT_NODE)
&& hasOnlyWhiteSpace(c1)) {
// single link followed by whitespace node
return true;
}
if ((c1.getNodeType() == Node.ELEMENT_NODE)
&& (params.elName.equalsIgnoreCase(c1.getNodeName()))
&& (c0.getNodeType() == Node.TEXT_NODE)
&& hasOnlyWhiteSpace(c0)) {
// whitespace node followed by single link
return true;
}
} else if (childLen == 3) {
Node c0 = children.item(0);
Node c1 = children.item(1);
Node c2 = children.item(2);
if ((c1.getNodeType() == Node.ELEMENT_NODE)
&& (params.elName.equalsIgnoreCase(c1.getNodeName()))
&& (c0.getNodeType() == Node.TEXT_NODE)
&& (c2.getNodeType() == Node.TEXT_NODE)
&& hasOnlyWhiteSpace(c0)
&& hasOnlyWhiteSpace(c2)) {
// single link surrounded by whitespace nodes
return true;
}
}
return false;
}
/**
* This method finds all anchors below the supplied DOM
* <code>node</code>, and creates appropriate [EMAIL PROTECTED] Outlink}
* records for each (relative to the supplied <code>base</code>
* URL), and adds them to the <code>outlinks</code> [EMAIL PROTECTED]
* ArrayList}.
* <p/>
* <p/>
* <p/>
* Links without inner structure (tags, text, etc) are discarded, as
* are links which contain only single nested links and empty text
* nodes (this is a common DOM-fixup artifact, at least with
* nekohtml).
*/
public static final void getOutlinks(URL base, ArrayList outlinks,
Node node) {
NodeList children = node.getChildNodes();
int childLen = 0;
if (children != null)
childLen = children.getLength();
if (node.getNodeType() == Node.ELEMENT_NODE) {
LinkParams params = (LinkParams)
linkParams.get(node.getNodeName().toLowerCase());
if (params != null) {
if (!shouldThrowAwayLink(node, children, childLen, params)) {
StringBuffer linkText = new StringBuffer();
getText(linkText, node, true);
NamedNodeMap attrs = node.getAttributes();
String target = null;
boolean noFollow = false;
boolean post = false;
for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
String attrName = attr.getNodeName();
if (params.attrName.equalsIgnoreCase(attrName)) {
target = attr.getNodeValue();
} else if ("rel".equalsIgnoreCase(attrName) &&
"nofollow".equalsIgnoreCase(attr.getNodeValue())) {
noFollow = true;
} else if ("method".equalsIgnoreCase(attrName) &&
"post".equalsIgnoreCase(attr.getNodeValue())) {
post = true;
}
}
if (target != null && !noFollow && !post)
try {
URL url = new URL(base, target);
outlinks.add(new Outlink(url.toString(),
linkText.toString().trim()));
} catch (MalformedURLException e) {
// don't care
}
}
// this should not have any children, skip them
if (params.childLen == 0) return;
}
}
for (int i = 0; i < childLen; i++) {
getOutlinks(base, outlinks, children.item(i));
}
}
}
-------------------------------------
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.html;
import java.net.URL;
import java.util.Properties;
import org.apache.nutch.parse.HTMLMetaTags;
import org.w3c.dom.*;
/**
 * Class for parsing META Directives from DOM trees.  This class
 * handles specifically Robots META directives (all, none, nofollow,
 * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
 * instructions.  All meta directives are stored in a HTMLMetaTags
 * instance.
 */
public class HTMLMetaProcessor {

  /**
   * Sets the indicators in <code>metaTags</code> to appropriate
   * values, based on any META tags found under the given
   * <code>node</code>.
   */
  public static final void getMetaTags(
      HTMLMetaTags metaTags, Node node, URL currURL) {
    metaTags.reset();
    getMetaTagsHelper(metaTags, node, currURL);
  }

  /**
   * Collects the available meta tags from an HTML page and stores them
   * into <code>metaTags</code>.
   *
   * @param metaTags destination for the collected directives
   * @param node     root of the parsed page's DOM tree
   * @param currURL  URL of the page, used to resolve refresh targets
   */
  private static final void getMetaTagsHelper(HTMLMetaTags metaTags,
                                              Node node, URL currURL) {
    // NOTE: every GetMetaAttributes call walks the subtree under node.
    // Values come back lower-cased (see DOMContentUtils), which the
    // directive scans below rely on.
    metaTags.setDescription(
        DOMContentUtils.GetMetaAttributes(node, "name", "description"));
    metaTags.setKeywords(
        DOMContentUtils.GetMetaAttributes(node, "name", "keywords"));

    String content =
        DOMContentUtils.GetMetaAttributes(node, "http-equiv", "pragma");
    if (content != null && content.indexOf("no-cache") >= 0) {
      metaTags.setNoCache();
    }

    content = DOMContentUtils.GetMetaAttributes(node, "http-equiv", "refresh");
    if (content != null) {
      parseRefresh(metaTags, content, currURL);
    }

    content = DOMContentUtils.GetMetaAttributes(node, "name", "robots");
    if (content != null) {
      parseRobots(metaTags, content);
    }

    URL baseHref = DOMContentUtils.getBase(node);
    if (baseHref != null) {
      metaTags.setBaseHref(baseHref);
    }
  }

  /**
   * Parses a refresh directive of the form
   * <code>seconds[;url=target]</code> and records the refresh time and
   * target URL in <code>metaTags</code>.  If only a time is present, the
   * page refreshes to itself (<code>currURL</code>).
   */
  private static final void parseRefresh(HTMLMetaTags metaTags,
                                         String content, URL currURL) {
    int index = content.indexOf(';');
    String time = (index == -1)
        ? content                      // just the refresh time
        : content.substring(0, index); // time precedes the ';'
    try {
      // trim so entries like " 5; url=..." still parse
      metaTags.setRefreshTime(Integer.parseInt(time.trim()));
      // only mark as a refresh if we could parse the time
      metaTags.setRefresh(true);
    } catch (Exception e) {
      // unparsable refresh time: not a usable refresh directive
    }
    URL refreshUrl = null;
    if (metaTags.getRefresh() && index != -1) { // set the URL
      int urlIndex = content.indexOf("url=");
      if (urlIndex == -1) {
        // assume a mis-formatted entry with just the url after the ';'
        urlIndex = index + 1;
      } else {
        urlIndex += 4; // skip over "url="
      }
      String url = content.substring(urlIndex);
      try {
        refreshUrl = new URL(url);
      } catch (Exception e) {
        // XXX according to the spec, this has to be an absolute
        // XXX url. However, many websites use relative URLs and
        // XXX expect browsers to handle that.
        // XXX Unfortunately, in some cases this may create a
        // XXX infinitely recursive paths (a crawler trap)...
        // if (!url.startsWith("/")) url = "/" + url;
        try {
          refreshUrl = new URL(currURL, url);
        } catch (Exception e1) {
          refreshUrl = null;
        }
      }
    }
    if (metaTags.getRefresh()) {
      if (refreshUrl == null) {
        // apparently only refresh time was present. set the URL
        // to the same URL.
        refreshUrl = currURL;
      }
      metaTags.setRefreshHref(refreshUrl);
    }
  }

  /**
   * Applies robots META directives ("none", "noindex", "nofollow") to
   * <code>metaTags</code>.  "all" is the default and needs no action.
   */
  private static final void parseRobots(HTMLMetaTags metaTags,
                                        String content) {
    if (content.indexOf("none") >= 0) {
      metaTags.setNoIndex();
      metaTags.setNoFollow();
    }
    if (content.indexOf("noindex") >= 0) {
      metaTags.setNoIndex();
    }
    if (content.indexOf("nofollow") >= 0) {
      metaTags.setNoFollow();
    }
  }
}