[Nutch-dev] Re: HTMLMetaProcessor a bug?

Gal Nitzan Tue, 10 Jan 2006 11:49:01 -0800

Because I needed to add two more fields from the meta tags in the html
page I have revised some of the code in HTMLMetaProcessor and in
DOMContentUtils.


I believe it is a little more generic than both the existing code and the
sample from Jérôme quoted below (look at DOMContentUtils.GetMetaAttributes),
since the existing code can handle only http-equiv or name...

Since I am not too familiar with svn, I am pasting the code below in this
email; it might be useful to someone.

On Tue, 2006-01-10 at 08:48 -0800, Doug Cutting wrote:
> Jérôme Charron wrote:
> > For consistency and to decouple a little Nutch HTML Parser and Xerces
> > implementation, I suggest to change these lines by something like:
> > Node nameNode = null;
> > Node equivNode = null;
> > Node contentNode = null;
> > for (int i=0; i<attrs.getLength(); i++) {
> >   Node attr = attrs.item(i);
> >   String attrName = attr.getNodeName().toLowerCase();
> >   if (attrName.equals("name")) {
> >     nameNode = attr;
> >   } else if (attrName.equals("http-equiv")) {
> >     equivNode = attr;
> >   } else if (attrName.equals("content")) {
> >     contentNode = attr;
> >   }
> > }
> 
> +1
> 


/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse.html;

import java.net.URL;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.nutch.parse.Outlink;

import org.w3c.dom.*;

/**
 * A collection of methods for extracting content from DOM trees.
 * <p/>
 * This class holds a few utility methods for pulling content out of
 * DOM nodes, such as getOutlinks, getText, etc.
 */
public class DOMContentUtils {

  /**
   * Describes one kind of link-bearing element: the element name, the
   * attribute that carries the link target, and a child-count value
   * (0 is used for elements that are not expected to have children).
   */
  public static class LinkParams {
    public String elName;
    public String attrName;
    public int childLen;

    /**
     * @param elName   name of the element
     * @param attrName name of the attribute holding the link target
     * @param childLen child-count value for this kind of element
     */
    public LinkParams(String elName, String attrName, int childLen) {
      this.childLen = childLen;
      this.attrName = attrName;
      this.elName = elName;
    }

    /** Renders the three fields in a compact form, for debugging. */
    public String toString() {
      StringBuffer text = new StringBuffer("LP[el=");
      text.append(elName);
      text.append(",attr=").append(attrName);
      text.append(",len=").append(childLen);
      text.append("]");
      return text.toString();
    }
  }

  /**
   * Lookup table from lower-case element name to the {@link LinkParams}
   * describing how that element carries a link.
   */
  public static HashMap linkParams = new HashMap();

  static {
    // Every entry is registered under its own element name.
    final LinkParams[] known = {
      new LinkParams("a", "href", 1),
      new LinkParams("area", "href", 0),
      new LinkParams("form", "action", 1),
      new LinkParams("frame", "src", 0),
      new LinkParams("iframe", "src", 0),
      new LinkParams("script", "src", 0),
      new LinkParams("link", "href", 0),
      new LinkParams("img", "src", 0),
    };
    for (int k = 0; k < known.length; k++) {
      linkParams.put(known[k].elName, known[k]);
    }
  }

  /**
   * Appends all the content text found beneath the given DOM
   * {@link Node} to the given {@link StringBuffer}.
   * <p/>
   * If <code>abortOnNestedAnchors</code> is true, DOM traversal stops as
   * soon as a nested anchor is found, so the <code>StringBuffer</code>
   * will not contain any text encountered after that point.
   *
   * @param sb                   buffer the text is appended to
   * @param node                 root of the subtree to read
   * @param abortOnNestedAnchors whether to stop at the first nested anchor
   * @return true if nested anchors were found
   */
  public static final boolean getText(StringBuffer sb, Node node,
                                      boolean abortOnNestedAnchors) {
    // The walk starts outside of any anchor, hence depth 0.
    return getTextHelper(sb, node, abortOnNestedAnchors, 0);
  }


  /**
   * This is a convenience method, equivalent to {@link
   * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
   * The boolean result of that call is discarded.
   */
  public static final void getText(StringBuffer sb, Node node) {
    getText(sb, node, false);
  }

  // Recursive worker behind getText: appends the whitespace-normalised
  // text below node to sb, skipping script/style elements and comments.
  // Returns true only when abortOnNestedAnchors is set and an anchor
  // nested inside another anchor has been reached.
  private static final boolean getTextHelper(StringBuffer sb, Node node,
                                             boolean abortOnNestedAnchors,
                                             int anchorDepth) {
    final String tag = node.getNodeName();
    if ("script".equalsIgnoreCase(tag) || "style".equalsIgnoreCase(tag)) {
      return false;
    }
    // anchorDepth is only tracked when the caller asked for the abort.
    if (abortOnNestedAnchors && "a".equalsIgnoreCase(tag)
        && ++anchorDepth > 1) {
      return true;
    }

    final short kind = node.getNodeType();
    if (kind == Node.COMMENT_NODE) {
      return false;
    }
    if (kind == Node.TEXT_NODE) {
      // collapse whitespace runs, then drop leading/trailing blanks
      final String cleaned =
          node.getNodeValue().replaceAll("\\s+", " ").trim();
      if (cleaned.length() > 0) {
        if (sb.length() > 0) {
          sb.append(' ');
        }
        sb.append(cleaned);
      }
    }

    final NodeList kids = node.getChildNodes();
    if (kids == null) {
      return false;
    }
    final int count = kids.getLength();
    for (int k = 0; k < count; k++) {
      if (getTextHelper(sb, kids.item(k), abortOnNestedAnchors,
          anchorDepth)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Appends the content text found beneath the first <code>title</code>
   * element below the given DOM {@link Node} to the given
   * {@link StringBuffer}. Subtrees rooted at a <code>body</code> node are
   * not searched.
   *
   * @param sb   buffer the title text is appended to
   * @param node root of the subtree to search
   * @return true if a title node was found, false otherwise
   */
  public static final boolean getTitle(StringBuffer sb, Node node) {
    final String tag = node.getNodeName();

    // stop after HEAD
    if ("body".equalsIgnoreCase(tag)) {
      return false;
    }

    if (node.getNodeType() == Node.ELEMENT_NODE
        && "title".equalsIgnoreCase(tag)) {
      getText(sb, node);
      return true;
    }

    // not a title itself: try each child in document order
    final NodeList kids = node.getChildNodes();
    final int count = (kids == null) ? 0 : kids.getLength();
    int idx = 0;
    while (idx < count) {
      if (getTitle(sb, kids.item(idx))) {
        return true;
      }
      idx++;
    }
    return false;
  }

  /**
   * Searches the subtree below <code>node</code> for the first
   * <code>meta</code> element that has an attribute named
   * <code>nodeName</code> whose value equals <code>nodeValue</code>
   * (both compared ignoring case) and that also has a
   * <code>content</code> attribute, and returns that content in lower
   * case. Subtrees rooted at a <code>body</code> node are not searched.
   * <p/>
   * Any additional attributes on the <code>meta</code> element (for
   * example <code>lang</code> or <code>scheme</code>) are ignored, and the
   * order of the attributes does not matter.
   *
   * @param node      root of the subtree to search
   * @param nodeName  attribute name to match, e.g. "name" or "http-equiv"
   * @param nodeValue required value of that attribute, e.g. "robots"
   * @return the lower-cased <code>content</code> value of the first
   *         matching element, or null if there is none
   */
  public static final String GetMetaAttributes(Node node, String
nodeName, String nodeValue) {
    // stop after HEAD
    if ("body".equalsIgnoreCase(node.getNodeName())) {
      return null;
    }

    if (node.getNodeType() == Node.ELEMENT_NODE
        && "meta".equalsIgnoreCase(node.getNodeName())) {
      NamedNodeMap attrs = node.getAttributes();
      if (attrs == null) {
        return null;
      }

      // Look the two interesting attributes up by name instead of relying
      // on the element having exactly two attributes in a known order.
      Node keyAttr = null;
      Node contentAttr = null;
      for (int i = 0; i < attrs.getLength(); i++) {
        Node attr = attrs.item(i);
        String attrName = attr.getNodeName();
        if ("content".equalsIgnoreCase(attrName)) {
          contentAttr = attr;
        } else if (nodeName.equalsIgnoreCase(attrName)) {
          keyAttr = attr;
        }
      }

      if (keyAttr == null || contentAttr == null
          || !nodeValue.equalsIgnoreCase(keyAttr.getNodeValue())) {
        return null;
      }

      String content = contentAttr.getNodeValue();
      // A fixed locale keeps the result independent of the JVM's default
      // locale (e.g. Turkish would map 'I' to a dotless 'i').
      return (content == null)
          ? null : content.toLowerCase(java.util.Locale.ENGLISH);
    }

    NodeList children = node.getChildNodes();
    if (children != null) {
      int len = children.getLength();
      for (int i = 0; i < len; i++) {
        String found = GetMetaAttributes(children.item(i), nodeName,
            nodeValue);
        if (found != null) {
          return found;
        }
      }
    }

    return null;
  }

  /**
   * If Node contains a BASE tag then it's HREF is returned.
   */
  public static final URL getBase(Node node) {

    // is this node a BASE tag?
    if (node.getNodeType() == Node.ELEMENT_NODE) {

      if ("body".equalsIgnoreCase(node.getNodeName())) // stop after
HEAD
        return null;


      if ("base".equalsIgnoreCase(node.getNodeName())) {
        NamedNodeMap attrs = node.getAttributes();
        for (int i = 0; i < attrs.getLength(); i++) {
          Node attr = attrs.item(i);
          if ("href".equalsIgnoreCase(attr.getNodeName())) {
            try {
              return new URL(attr.getNodeValue());
            } catch (MalformedURLException e) {
            }
          }
        }
      }
    }

    // does it contain a base tag?
    NodeList children = node.getChildNodes();
    if (children != null) {
      int len = children.getLength();
      for (int i = 0; i < len; i++) {
        URL base = getBase(children.item(i));
        if (base != null)
          return base;
      }
    }

    // no.
    return null;
  }


  // True when the node's value contains no non-whitespace character
  // (an empty value therefore counts as whitespace-only).
  private static boolean hasOnlyWhiteSpace(Node node) {
    final char[] chars = node.getNodeValue().toCharArray();
    int pos = 0;
    while (pos < chars.length) {
      if (!Character.isWhitespace(chars[pos])) {
        return false;
      }
      pos++;
    }
    return true;
  }

  // Decides whether a link element should be discarded, based on its
  // children. This only covers a few cases of empty links that are
  // symptomatic of nekohtml's DOM-fixup process: no children where some
  // are expected, or nothing but a single nested element of the same
  // name, optionally padded by whitespace-only text nodes.
  private static boolean shouldThrowAwayLink(Node node, NodeList children,
                                             int childLen, LinkParams params) {
    switch (childLen) {
      case 0:
        // no inner structure: acceptable only for elements registered
        // with a child-count value of 0
        return params.childLen != 0;

      case 1: {
        // single nested link
        final Node only = children.item(0);
        return (only.getNodeType() == Node.ELEMENT_NODE)
            && params.elName.equalsIgnoreCase(only.getNodeName());
      }

      case 2: {
        final Node first = children.item(0);
        final Node second = children.item(1);

        // single link followed by whitespace node
        final boolean linkThenBlank =
            (first.getNodeType() == Node.ELEMENT_NODE)
            && params.elName.equalsIgnoreCase(first.getNodeName())
            && (second.getNodeType() == Node.TEXT_NODE)
            && hasOnlyWhiteSpace(second);
        if (linkThenBlank) {
          return true;
        }

        // whitespace node followed by single link
        return (second.getNodeType() == Node.ELEMENT_NODE)
            && params.elName.equalsIgnoreCase(second.getNodeName())
            && (first.getNodeType() == Node.TEXT_NODE)
            && hasOnlyWhiteSpace(first);
      }

      case 3: {
        final Node left = children.item(0);
        final Node middle = children.item(1);
        final Node right = children.item(2);

        // single link surrounded by whitespace nodes
        return (middle.getNodeType() == Node.ELEMENT_NODE)
            && params.elName.equalsIgnoreCase(middle.getNodeName())
            && (left.getNodeType() == Node.TEXT_NODE)
            && (right.getNodeType() == Node.TEXT_NODE)
            && hasOnlyWhiteSpace(left)
            && hasOnlyWhiteSpace(right);
      }

      default:
        return false;
    }
  }

  /**
   * Finds all link-bearing elements (those registered in
   * <code>linkParams</code>) below the supplied DOM <code>node</code>,
   * creates an {@link Outlink} record for each (resolved relative to the
   * supplied <code>base</code> URL), and adds them to the
   * <code>outlinks</code> {@link ArrayList}.
   * <p/>
   * Links without inner structure (tags, text, etc) are discarded, as
   * are links which contain only single nested links and empty text
   * nodes (this is a common DOM-fixup artifact, at least with
   * nekohtml). Links whose <code>rel</code> attribute contains the
   * <code>nofollow</code> token, links with <code>method="post"</code>,
   * and links whose target cannot be turned into a URL are skipped too.
   *
   * @param base     URL against which link targets are resolved
   * @param outlinks list that receives the {@link Outlink} records
   * @param node     root of the subtree to scan
   */
  public static final void getOutlinks(URL base, ArrayList outlinks,
                                       Node node) {

    NodeList children = node.getChildNodes();
    int childLen = 0;
    if (children != null) {
      childLen = children.getLength();
    }

    if (node.getNodeType() == Node.ELEMENT_NODE) {
      // Lower-case with a fixed locale so that the table lookup does not
      // depend on the JVM's default locale (under a Turkish locale "IMG"
      // would otherwise not map to "img").
      String elName =
          node.getNodeName().toLowerCase(java.util.Locale.ENGLISH);
      LinkParams params = (LinkParams) linkParams.get(elName);
      if (params != null) {
        if (!shouldThrowAwayLink(node, children, childLen, params)) {

          StringBuffer linkText = new StringBuffer();
          getText(linkText, node, true);

          NamedNodeMap attrs = node.getAttributes();
          String target = null;
          boolean noFollow = false;
          boolean post = false;
          for (int i = 0; i < attrs.getLength(); i++) {
            Node attr = attrs.item(i);
            String attrName = attr.getNodeName();
            String attrValue = attr.getNodeValue();
            if (params.attrName.equalsIgnoreCase(attrName)) {
              target = attrValue;
            } else if ("rel".equalsIgnoreCase(attrName)) {
              // rel holds a whitespace-separated list of link types, so
              // look for the token instead of comparing the whole value.
              if (attrValue != null) {
                String[] relTokens = attrValue.trim().split("\\s+");
                for (int t = 0; t < relTokens.length; t++) {
                  if ("nofollow".equalsIgnoreCase(relTokens[t])) {
                    noFollow = true;
                  }
                }
              }
            } else if ("method".equalsIgnoreCase(attrName)
                && "post".equalsIgnoreCase(attrValue)) {
              post = true;
            }
          }
          if (target != null && !noFollow && !post) {
            try {
              URL url = new URL(base, target);
              outlinks.add(new Outlink(url.toString(),
                  linkText.toString().trim()));
            } catch (MalformedURLException e) {
              // don't care: a target that cannot be resolved to a URL
              // simply produces no outlink
            }
          }
        }
        // this should not have any children, skip them
        if (params.childLen == 0) {
          return;
        }
      }
    }
    for (int i = 0; i < childLen; i++) {
      getOutlinks(base, outlinks, children.item(i));
    }
  }

}

-------------------------------------
/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse.html;

import java.net.URL;
import java.util.Properties;

import org.apache.nutch.parse.HTMLMetaTags;
import org.w3c.dom.*;

/**
 * Class for parsing META Directives from DOM trees.  This class
 * handles specifically Robots META directives (all, none, nofollow,
 * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
 * instructions. All meta directives are stored in a HTMLMetaTags instance.
 */
public class HTMLMetaProcessor {

  /**
   * Utility class with indicators for the robots directives "noindex"
   * and "nofollow", and HTTP-EQUIV/no-cache
   */

  /**
   * Resets <code>metaTags</code> and then sets its indicators to
   * appropriate values, based on any META tags found under the given
   * <code>node</code>.
   *
   * @param metaTags holder that is reset and then filled in
   * @param node     root of the DOM subtree to inspect
   * @param currURL  URL of the current page; handed on to the helper,
   *                 which uses it when working out the refresh target
   */
  public static final void getMetaTags(
      HTMLMetaTags metaTags, Node node, URL currURL) {

    // start from a clean state so earlier results cannot leak through
    metaTags.reset();
    getMetaTagsHelper(metaTags, node, currURL);
  }

  /**
   * Collects the available meta tags from an HTML page: description,
   * keywords, the <code>pragma</code> no-cache instruction, the
   * <code>refresh</code> directive, the robots directives and the BASE
   * HREF, in that order.
   * <p/>
   * All values come from {@link DOMContentUtils#GetMetaAttributes}, which
   * returns them in lower case; the keyword checks below rely on that.
   *
   * @param metaTags holder that receives the values found
   * @param node     root of the DOM subtree to inspect
   * @param currURL  URL of the current page, used to resolve a relative
   *                 refresh URL and as the refresh target when none is given
   */
  private static final void getMetaTagsHelper(HTMLMetaTags metaTags,
      Node node, URL currURL) {
    // description and keywords are stored as found (null when absent)
    metaTags.setDescription(
        DOMContentUtils.GetMetaAttributes(node, "name", "description"));
    metaTags.setKeywords(
        DOMContentUtils.GetMetaAttributes(node, "name", "keywords"));

    String pragma =
        DOMContentUtils.GetMetaAttributes(node, "http-equiv", "pragma");
    if (pragma != null && pragma.indexOf("no-cache") >= 0) {
      metaTags.setNoCache();
    }

    String refresh =
        DOMContentUtils.GetMetaAttributes(node, "http-equiv", "refresh");
    if (refresh != null) {
      applyRefresh(metaTags, refresh, currURL);
    }

    String robots =
        DOMContentUtils.GetMetaAttributes(node, "name", "robots");
    if (robots != null) {
      applyRobots(metaTags, robots);
    }

    URL base = DOMContentUtils.getBase(node);
    if (base != null) {
      metaTags.setBaseHref(base);
    }
  }

  /**
   * Interprets the content of a <code>refresh</code> directive, expected
   * in the form <code>time</code> or <code>time; url=target</code>.
   */
  private static void applyRefresh(HTMLMetaTags metaTags, String content,
      URL currURL) {
    int semicolon = content.indexOf(';');

    // everything before the first ';' (or the whole value) is the delay
    String time =
        (semicolon == -1) ? content : content.substring(0, semicolon);
    try {
      metaTags.setRefreshTime(Integer.parseInt(time.trim()));
      // skip this if we couldn't parse the time
      metaTags.setRefresh(true);
    } catch (Exception e) {
      // Deliberately ignored: an unparsable delay leaves the refresh
      // flag as it was.
    }

    if (!metaTags.getRefresh()) {
      return;
    }

    URL refreshUrl = null;
    if (semicolon != -1) { // set the URL
      int start = content.indexOf("url=");
      if (start == -1) {
        // assume a mis-formatted entry with just the url
        start = semicolon + 1;
      } else {
        start += 4;
      }
      // NOTE(review): content arrives lower-cased, so the case of the
      // URL path is lost here - confirm whether that is acceptable.
      String url = content.substring(start);
      try {
        refreshUrl = new URL(url);
      } catch (Exception e) {
        // XXX according to the spec, this has to be an absolute
        // XXX url. However, many websites use relative URLs and
        // XXX expect browsers to handle that.
        // XXX Unfortunately, in some cases this may create a
        // XXX infinitely recursive paths (a crawler trap)...
        // if (!url.startsWith("/")) url = "/" + url;
        try {
          refreshUrl = new URL(currURL, url);
        } catch (Exception e1) {
          refreshUrl = null;
        }
      }
    }

    if (refreshUrl == null) {
      // apparently only refresh time was present. set the URL
      // to the same URL.
      refreshUrl = currURL;
    }
    metaTags.setRefreshHref(refreshUrl);
  }

  /**
   * Interprets the content of a <code>robots</code> directive. "all"
   * needs no action.
   */
  private static void applyRobots(HTMLMetaTags metaTags, String content) {
    if (content.indexOf("none") >= 0) {
      metaTags.setNoIndex();
      metaTags.setNoFollow();
    }
    if (content.indexOf("noindex") >= 0) {
      metaTags.setNoIndex();
    }
    if (content.indexOf("nofollow") >= 0) {
      metaTags.setNoFollow();
    }
  }
}





-------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc. Do you grep through log files
for problems?  Stop!  Download the new AJAX search engine that makes
searching your log files as easy as surfing the  web.  DOWNLOAD SPLUNK!
http://ads.osdn.com/?ad_id=7637&alloc_id=16865&op=click
_______________________________________________
Nutch-developers mailing list
Nutch-developers@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-developers

[Nutch-dev] Re: HTMLMetaProcessor a bug?

Reply via email to