Because I needed to add two more fields from the meta tags in the html
page I have revised some of the code in HTMLMetaProcessor and in
DOMContentUtils.
I believe it to be a little more generic than the existing code (see
DOMContentUtils.GetMetaAttributes); it follows the sample from Jérôme below,
since the existing code can handle only http-equiv or name.
Since I am not too familiar with svn, I have pasted the code at the end of
this email; it might be useful to someone.
On Tue, 2006-01-10 at 08:48 -0800, Doug Cutting wrote:
> Jérôme Charron wrote:
> > For consistency and to decouple a little Nutch HTML Parser and Xerces
> > implementation, I suggest to change these lines by something like:
> > Node nameNode = null;
> > Node equivNode = null;
> > Node contentNode = null;
> > for (int i=0; i<attrs.getLength(); i++) {
> > Node attr = attrs.item(i);
> > String attrName = attr.getNodeName().toLowerCase();
> > if (attrName.equals("name")) {
> > nameNode = attr;
> > } else if (attrName.equals("http-equiv")) {
> > equivNode = attr;
> > } else if (attrName.equals("content")) {
> > contentNode = attr;
> > }
> > }
>
> +1
>
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.html;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import org.apache.nutch.parse.Outlink;
import org.w3c.dom.*;
/**
* A collection of methods for extracting content from DOM trees.
* <p/>
* This class holds a few utility methods for pulling content out of
* DOM nodes, such as getOutlinks, getText, etc.
*/
public class DOMContentUtils {
public static class LinkParams {
public String elName;
public String attrName;
public int childLen;
public LinkParams(String elName, String attrName, int childLen) {
this.elName = elName;
this.attrName = attrName;
this.childLen = childLen;
}
public String toString() {
return "LP[el=" + elName + ",attr=" + attrName + ",len=" +
childLen + "]";
}
}
public static HashMap linkParams = new HashMap();
static {
linkParams.put("a", new LinkParams("a", "href", 1));
linkParams.put("area", new LinkParams("area", "href", 0));
linkParams.put("form", new LinkParams("form", "action", 1));
linkParams.put("frame", new LinkParams("frame", "src", 0));
linkParams.put("iframe", new LinkParams("iframe", "src", 0));
linkParams.put("script", new LinkParams("script", "src", 0));
linkParams.put("link", new LinkParams("link", "href", 0));
linkParams.put("img", new LinkParams("img", "src", 0));
}
/**
* This method takes a [EMAIL PROTECTED] StringBuffer} and a DOM [EMAIL
PROTECTED] Node},
* and will append all the content text found beneath the DOM node to
* the <code>StringBuffer</code>.
* <p/>
* <p/>
* <p/>
* If <code>abortOnNestedAnchors</code> is true, DOM traversal will
* be aborted and the <code>StringBuffer</code> will not contain
* any text encountered after a nested anchor is found.
* <p/>
* <p/>
*
* @return true if nested anchors were found
*/
public static final boolean getText(StringBuffer sb, Node node,
boolean abortOnNestedAnchors) {
if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
return true;
}
return false;
}
/**
* This is a convinience method, equivalent to [EMAIL PROTECTED]
* #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
*/
public static final void getText(StringBuffer sb, Node node) {
getText(sb, node, false);
}
// returns true if abortOnNestedAnchors is true and we find nested
// anchors
private static final boolean getTextHelper(StringBuffer sb, Node node,
boolean
abortOnNestedAnchors,
int anchorDepth) {
if ("script".equalsIgnoreCase(node.getNodeName())) {
return false;
}
if ("style".equalsIgnoreCase(node.getNodeName())) {
return false;
}
if (abortOnNestedAnchors &&
"a".equalsIgnoreCase(node.getNodeName())) {
anchorDepth++;
if (anchorDepth > 1)
return true;
}
if (node.getNodeType() == Node.COMMENT_NODE) {
return false;
}
if (node.getNodeType() == Node.TEXT_NODE) {
// cleanup and trim the value
String text = node.getNodeValue();
text = text.replaceAll("\\s+", " ");
text = text.trim();
if (text.length() > 0) {
if (sb.length() > 0) sb.append(' ');
sb.append(text);
}
}
boolean abort = false;
NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
for (int i = 0; i < len; i++) {
if (getTextHelper(sb, children.item(i),
abortOnNestedAnchors, anchorDepth)) {
abort = true;
break;
}
}
}
return abort;
}
/**
* This method takes a [EMAIL PROTECTED] StringBuffer} and a DOM [EMAIL
PROTECTED] Node},
* and will append the content text found beneath the first
* <code>title</code> node to the <code>StringBuffer</code>.
*
* @return true if a title node was found, false otherwise
*/
public static final boolean getTitle(StringBuffer sb, Node node) {
if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD
return false;
if (node.getNodeType() == Node.ELEMENT_NODE) {
if ("title".equalsIgnoreCase(node.getNodeName())) {
getText(sb, node);
return true;
}
}
NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
for (int i = 0; i < len; i++) {
if (getTitle(sb, children.item(i))) {
return true;
}
}
}
return false;
}
public static final String GetMetaAttributes(Node node, String
nodeName, String nodeValue) {
String ret = null;
if ("body".equalsIgnoreCase(node.getNodeName()))
return ret;
if (node.getNodeType() == Node.ELEMENT_NODE) {
if ("meta".equalsIgnoreCase(node.getNodeName())) {
if (!node.hasAttributes())
return ret;
NamedNodeMap attr = node.getAttributes();
if (attr.getLength() != 2)
return ret;
Node n1 = attr.item(0);
Node n2 = attr.item(1);
if (nodeName.equalsIgnoreCase(n1.getNodeName()))
{
if (!nodeValue.equalsIgnoreCase(n1.getNodeValue()))
return ret;
if (!"content".equalsIgnoreCase(n2.getNodeName()))
return ret;
ret = n2.getNodeValue().toLowerCase();
return ret;
}
if (nodeName.equalsIgnoreCase(n2.getNodeName()))
{
if (!nodeValue.equalsIgnoreCase(n2.getNodeValue()))
return ret;
if (!"content".equalsIgnoreCase(n1.getNodeName()))
return ret;
ret = n1.getNodeValue().toLowerCase();
return ret;
}
return ret;
}
}
NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
for (int i = 0; i < len; i++) {
if ((ret = GetMetaAttributes(children.item(i), nodeName,
nodeValue)) != null) {
return ret;
}
}
}
return ret;
}
/**
* If Node contains a BASE tag then it's HREF is returned.
*/
public static final URL getBase(Node node) {
// is this node a BASE tag?
if (node.getNodeType() == Node.ELEMENT_NODE) {
if ("body".equalsIgnoreCase(node.getNodeName())) // stop after
HEAD
return null;
if ("base".equalsIgnoreCase(node.getNodeName())) {
NamedNodeMap attrs = node.getAttributes();
for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
if ("href".equalsIgnoreCase(attr.getNodeName())) {
try {
return new URL(attr.getNodeValue());
} catch (MalformedURLException e) {
}
}
}
}
}
// does it contain a base tag?
NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
for (int i = 0; i < len; i++) {
URL base = getBase(children.item(i));
if (base != null)
return base;
}
}
// no.
return null;
}
private static boolean hasOnlyWhiteSpace(Node node) {
String val = node.getNodeValue();
for (int i = 0; i < val.length(); i++) {
if (!Character.isWhitespace(val.charAt(i)))
return false;
}
return true;
}
// this only covers a few cases of empty links that are symptomatic
// of nekohtml's DOM-fixup process...
private static boolean shouldThrowAwayLink(Node node, NodeList
children,
int childLen, LinkParams
params) {
if (childLen == 0) {
// this has no inner structure
if (params.childLen == 0) return false;
else return true;
} else if ((childLen == 1)
&& (children.item(0).getNodeType() == Node.ELEMENT_NODE)
&&
(params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
// single nested link
return true;
} else if (childLen == 2) {
Node c0 = children.item(0);
Node c1 = children.item(1);
if ((c0.getNodeType() == Node.ELEMENT_NODE)
&& (params.elName.equalsIgnoreCase(c0.getNodeName()))
&& (c1.getNodeType() == Node.TEXT_NODE)
&& hasOnlyWhiteSpace(c1)) {
// single link followed by whitespace node
return true;
}
if ((c1.getNodeType() == Node.ELEMENT_NODE)
&& (params.elName.equalsIgnoreCase(c1.getNodeName()))
&& (c0.getNodeType() == Node.TEXT_NODE)
&& hasOnlyWhiteSpace(c0)) {
// whitespace node followed by single link
return true;
}
} else if (childLen == 3) {
Node c0 = children.item(0);
Node c1 = children.item(1);
Node c2 = children.item(2);
if ((c1.getNodeType() == Node.ELEMENT_NODE)
&& (params.elName.equalsIgnoreCase(c1.getNodeName()))
&& (c0.getNodeType() == Node.TEXT_NODE)
&& (c2.getNodeType() == Node.TEXT_NODE)
&& hasOnlyWhiteSpace(c0)
&& hasOnlyWhiteSpace(c2)) {
// single link surrounded by whitespace nodes
return true;
}
}
return false;
}
/**
* This method finds all anchors below the supplied DOM
* <code>node</code>, and creates appropriate [EMAIL PROTECTED] Outlink}
* records for each (relative to the supplied <code>base</code>
* URL), and adds them to the <code>outlinks</code> [EMAIL PROTECTED]
* ArrayList}.
* <p/>
* <p/>
* <p/>
* Links without inner structure (tags, text, etc) are discarded, as
* are links which contain only single nested links and empty text
* nodes (this is a common DOM-fixup artifact, at least with
* nekohtml).
*/
public static final void getOutlinks(URL base, ArrayList outlinks,
Node node) {
NodeList children = node.getChildNodes();
int childLen = 0;
if (children != null)
childLen = children.getLength();
if (node.getNodeType() == Node.ELEMENT_NODE) {
LinkParams params = (LinkParams)
linkParams.get(node.getNodeName().toLowerCase());
if (params != null) {
if (!shouldThrowAwayLink(node, children, childLen, params)) {
StringBuffer linkText = new StringBuffer();
getText(linkText, node, true);
NamedNodeMap attrs = node.getAttributes();
String target = null;
boolean noFollow = false;
boolean post = false;
for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
String attrName = attr.getNodeName();
if (params.attrName.equalsIgnoreCase(attrName)) {
target = attr.getNodeValue();
} else if ("rel".equalsIgnoreCase(attrName) &&
"nofollow".equalsIgnoreCase(attr.getNodeValue())) {
noFollow = true;
} else if ("method".equalsIgnoreCase(attrName) &&
"post".equalsIgnoreCase(attr.getNodeValue())) {
post = true;
}
}
if (target != null && !noFollow && !post)
try {
URL url = new URL(base, target);
outlinks.add(new Outlink(url.toString(),
linkText.toString().trim()));
} catch (MalformedURLException e) {
// don't care
}
}
// this should not have any children, skip them
if (params.childLen == 0) return;
}
}
for (int i = 0; i < childLen; i++) {
getOutlinks(base, outlinks, children.item(i));
}
}
}
-------------------------------------
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.html;
import java.net.URL;
import java.util.Properties;
import org.apache.nutch.parse.HTMLMetaTags;
import org.w3c.dom.*;
/**
 * Class for parsing META Directives from DOM trees.  This class
 * handles specifically Robots META directives (all, none, nofollow,
 * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
 * instructions.  All meta directives are stored in a HTMLMetaTags
 * instance.
 */
public class HTMLMetaProcessor {

  /**
   * Sets the indicators in <code>metaTags</code> to appropriate
   * values, based on any META tags found under the given
   * <code>node</code>.
   */
  public static final void getMetaTags(
      HTMLMetaTags metaTags, Node node, URL currURL) {
    metaTags.reset();
    getMetaTagsHelper(metaTags, node, currURL);
  }

  /**
   * Collects the available meta tags from an HTML page and stores them
   * into <code>metaTags</code>.
   *
   * @param metaTags destination for the collected directives
   * @param node     root of the parsed page's DOM tree
   * @param currURL  URL of the page, used to resolve refresh targets
   */
  private static final void getMetaTagsHelper(HTMLMetaTags metaTags,
                                              Node node, URL currURL) {
    // NOTE: every GetMetaAttributes call walks the subtree under node.
    // Values come back lower-cased (see DOMContentUtils), which the
    // directive scans below rely on.
    metaTags.setDescription(
        DOMContentUtils.GetMetaAttributes(node, "name", "description"));
    metaTags.setKeywords(
        DOMContentUtils.GetMetaAttributes(node, "name", "keywords"));

    String content =
        DOMContentUtils.GetMetaAttributes(node, "http-equiv", "pragma");
    if (content != null && content.indexOf("no-cache") >= 0) {
      metaTags.setNoCache();
    }

    content = DOMContentUtils.GetMetaAttributes(node, "http-equiv", "refresh");
    if (content != null) {
      parseRefresh(metaTags, content, currURL);
    }

    content = DOMContentUtils.GetMetaAttributes(node, "name", "robots");
    if (content != null) {
      parseRobots(metaTags, content);
    }

    URL baseHref = DOMContentUtils.getBase(node);
    if (baseHref != null) {
      metaTags.setBaseHref(baseHref);
    }
  }

  /**
   * Parses a refresh directive of the form
   * <code>seconds[;url=target]</code> and records the refresh time and
   * target URL in <code>metaTags</code>.  If only a time is present, the
   * page refreshes to itself (<code>currURL</code>).
   */
  private static final void parseRefresh(HTMLMetaTags metaTags,
                                         String content, URL currURL) {
    int index = content.indexOf(';');
    String time = (index == -1)
        ? content                      // just the refresh time
        : content.substring(0, index); // time precedes the ';'
    try {
      // trim so entries like " 5; url=..." still parse
      metaTags.setRefreshTime(Integer.parseInt(time.trim()));
      // only mark as a refresh if we could parse the time
      metaTags.setRefresh(true);
    } catch (Exception e) {
      // unparsable refresh time: not a usable refresh directive
    }
    URL refreshUrl = null;
    if (metaTags.getRefresh() && index != -1) { // set the URL
      int urlIndex = content.indexOf("url=");
      if (urlIndex == -1) {
        // assume a mis-formatted entry with just the url after the ';'
        urlIndex = index + 1;
      } else {
        urlIndex += 4; // skip over "url="
      }
      String url = content.substring(urlIndex);
      try {
        refreshUrl = new URL(url);
      } catch (Exception e) {
        // XXX according to the spec, this has to be an absolute
        // XXX url. However, many websites use relative URLs and
        // XXX expect browsers to handle that.
        // XXX Unfortunately, in some cases this may create a
        // XXX infinitely recursive paths (a crawler trap)...
        // if (!url.startsWith("/")) url = "/" + url;
        try {
          refreshUrl = new URL(currURL, url);
        } catch (Exception e1) {
          refreshUrl = null;
        }
      }
    }
    if (metaTags.getRefresh()) {
      if (refreshUrl == null) {
        // apparently only refresh time was present. set the URL
        // to the same URL.
        refreshUrl = currURL;
      }
      metaTags.setRefreshHref(refreshUrl);
    }
  }

  /**
   * Applies robots META directives ("none", "noindex", "nofollow") to
   * <code>metaTags</code>.  "all" is the default and needs no action.
   */
  private static final void parseRobots(HTMLMetaTags metaTags,
                                        String content) {
    if (content.indexOf("none") >= 0) {
      metaTags.setNoIndex();
      metaTags.setNoFollow();
    }
    if (content.indexOf("noindex") >= 0) {
      metaTags.setNoIndex();
    }
    if (content.indexOf("nofollow") >= 0) {
      metaTags.setNoFollow();
    }
  }
}