noindex tags now working!
Thank you, Andrzej. I appreciated your original pointer; it got me to
solve this through my own diligence, which is always more rewarding. You
are a credit to this board.
The complete modified DOMContentUtils.java file follows, for those who ask
the same questions I have asked:
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.html;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import org.apache.nutch.parse.Outlink;
import org.apache.hadoop.conf.Configuration;
import org.w3c.dom.*;
/**
* A collection of methods for extracting content from DOM trees.
*
* This class holds a few utility methods for pulling content out of
* DOM nodes, such as getOutlinks, getText, etc.
*
*/
public class DOMContentUtils {
private static boolean noindex = false;
public String elName;
public static class LinkParams {
public String elName;
public String attrName;
public int childLen;
public LinkParams(String elName, String attrName, int childLen) {
this.elName = elName;
this.attrName = attrName;
this.childLen = childLen;
}
public String toString() {
return "LP[el=" + elName + ",attr=" + attrName + ",len=" +
childLen + "]";
}
}
private HashMap linkParams = new HashMap();
private Configuration conf;
public DOMContentUtils(Configuration conf) {
setConf(conf);
}
public void setConf(Configuration conf) {
this.conf = conf;
linkParams.clear();
linkParams.put("a", new LinkParams("a", "href", 1));
linkParams.put("area", new LinkParams("area", "href", 0));
if (conf.getBoolean("parser.html.form.use_action", false)) {
linkParams.put("form", new LinkParams("form", "action", 1));
}
linkParams.put("frame", new LinkParams("frame", "src", 0));
linkParams.put("iframe", new LinkParams("iframe", "src", 0));
linkParams.put("script", new LinkParams("script", "src", 0));
linkParams.put("link", new LinkParams("link", "href", 0));
linkParams.put("img", new LinkParams("img", "src", 0));
}
/**
* This method takes a [EMAIL PROTECTED] StringBuffer} and a DOM [EMAIL
PROTECTED] Node},
* and will append all the content text found beneath the DOM node to
* the <code>StringBuffer</code>.
*
* <p>
*
* If <code>abortOnNestedAnchors</code> is true, DOM traversal will
* be aborted and the <code>StringBuffer</code> will not contain
* any text encountered after a nested anchor is found.
*
* <p>
*
* @return true if nested anchors were found
*/
public boolean getText(StringBuffer sb, Node node,
boolean abortOnNestedAnchors) {
if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
return true;
}
return false;
}
/**
* This is a convinience method, equivalent to [EMAIL PROTECTED]
* #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
*
*/
public void getText(StringBuffer sb, Node node) {
getText(sb, node, false);
}
// returns true if abortOnNestedAnchors is true and we find nested
// anchors
private boolean getTextHelper(StringBuffer sb, Node node,
boolean
abortOnNestedAnchors,
int anchorDepth) {
if ("script".equalsIgnoreCase(node.getNodeName())) {
return false;
}
if ("style".equalsIgnoreCase(node.getNodeName())) {
return false;
}
if (abortOnNestedAnchors &&
"a".equalsIgnoreCase(node.getNodeName())) {
anchorDepth++;
if (anchorDepth > 1)
return true;
}
if (node.getNodeType() == Node.COMMENT_NODE) {
String text = node.getNodeValue();
if (text.equals("noindex")){
noindex = true;
}
if (text.equals("/noindex")){
noindex = false;
}
return false;
}
if (node.getNodeType() == Node.TEXT_NODE) {
// cleanup and trim the value
String text = node.getNodeValue();
text = text.replaceAll("\\s+", " ");
text = text.trim();
if (text.length() > 0 && noindex == false) {
if (sb.length() > 0) sb.append(' ');
sb.append(text);
}
}
boolean abort = false;
NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
for (int i = 0; i < len; i++) {
if (getTextHelper(sb, children.item(i),
abortOnNestedAnchors, anchorDepth)) {
abort = true;
break;
}
}
}
return abort;
}
/**
* This method takes a [EMAIL PROTECTED] StringBuffer} and a DOM [EMAIL
PROTECTED] Node},
* and will append the content text found beneath the first
* <code>title</code> node to the <code>StringBuffer</code>.
*
* @return true if a title node was found, false otherwise
*/
public boolean getTitle(StringBuffer sb, Node node) {
if ("body".equalsIgnoreCase(node.getNodeName())) // stop after HEAD
return false;
if (node.getNodeType() == Node.ELEMENT_NODE) {
if ("title".equalsIgnoreCase(node.getNodeName())) {
getText(sb, node);
return true;
}
}
NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
for (int i = 0; i < len; i++) {
if (getTitle(sb, children.item(i))) {
return true;
}
}
}
return false;
}
/** If Node contains a BASE tag then it's HREF is returned. */
public URL getBase(Node node) {
// is this node a BASE tag?
if (node.getNodeType() == Node.ELEMENT_NODE) {
if ("body".equalsIgnoreCase(node.getNodeName())) // stop after
HEAD
return null;
if ("base".equalsIgnoreCase(node.getNodeName())) {
NamedNodeMap attrs = node.getAttributes();
for (int i= 0; i < attrs.getLength(); i++ ) {
Node attr = attrs.item(i);
if ("href".equalsIgnoreCase(attr.getNodeName())) {
try {
return new URL(attr.getNodeValue());
} catch (MalformedURLException e) {}
}
}
}
}
// does it contain a base tag?
NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
for (int i = 0; i < len; i++) {
URL base = getBase(children.item(i));
if (base != null)
return base;
}
}
// no.
return null;
}
private boolean hasOnlyWhiteSpace(Node node) {
String val= node.getNodeValue();
for (int i= 0; i < val.length(); i++) {
if (!Character.isWhitespace(val.charAt(i)))
return false;
}
return true;
}
// this only covers a few cases of empty links that are symptomatic
// of nekohtml's DOM-fixup process...
private boolean shouldThrowAwayLink(Node node, NodeList children,
int childLen, LinkParams
params) {
if (childLen == 0) {
// this has no inner structure
if (params.childLen == 0) return false;
else return true;
} else if ((childLen == 1)
&& (children.item(0).getNodeType() == Node.ELEMENT_NODE)
&&
(params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
// single nested link
return true;
} else if (childLen == 2) {
Node c0= children.item(0);
Node c1= children.item(1);
if ((c0.getNodeType() == Node.ELEMENT_NODE)
&& (params.elName.equalsIgnoreCase(c0.getNodeName()))
&& (c1.getNodeType() == Node.TEXT_NODE)
&& hasOnlyWhiteSpace(c1) ) {
// single link followed by whitespace node
return true;
}
if ((c1.getNodeType() == Node.ELEMENT_NODE)
&& (params.elName.equalsIgnoreCase(c1.getNodeName()))
&& (c0.getNodeType() == Node.TEXT_NODE)
&& hasOnlyWhiteSpace(c0) ) {
// whitespace node followed by single link
return true;
}
} else if (childLen == 3) {
Node c0= children.item(0);
Node c1= children.item(1);
Node c2= children.item(2);
if ((c1.getNodeType() == Node.ELEMENT_NODE)
&& (params.elName.equalsIgnoreCase(c1.getNodeName()))
&& (c0.getNodeType() == Node.TEXT_NODE)
&& (c2.getNodeType() == Node.TEXT_NODE)
&& hasOnlyWhiteSpace(c0)
&& hasOnlyWhiteSpace(c2) ) {
// single link surrounded by whitespace nodes
return true;
}
}
return false;
}
/**
* This method finds all anchors below the supplied DOM
* <code>node</code>, and creates appropriate [EMAIL PROTECTED] Outlink}
* records for each (relative to the supplied <code>base</code>
* URL), and adds them to the <code>outlinks</code> [EMAIL PROTECTED]
* ArrayList}.
*
* <p>
*
* Links without inner structure (tags, text, etc) are discarded, as
* are links which contain only single nested links and empty text
* nodes (this is a common DOM-fixup artifact, at least with
* nekohtml).
*/
public void getOutlinks(URL base, ArrayList outlinks,
Node node) {
NodeList children = node.getChildNodes();
int childLen= 0;
if (children != null)
childLen= children.getLength();
if (node.getNodeType() == Node.ELEMENT_NODE) {
String nodeName = node.getNodeName().toLowerCase();
LinkParams params = (LinkParams)linkParams.get(nodeName);
if (params != null) {
if (!shouldThrowAwayLink(node, children, childLen, params)) {
StringBuffer linkText = new StringBuffer();
getText(linkText, node, true);
NamedNodeMap attrs = node.getAttributes();
String target = null;
boolean noFollow = false;
boolean post = false;
for (int i= 0; i < attrs.getLength(); i++ ) {
Node attr = attrs.item(i);
String attrName = attr.getNodeName();
if (params.attrName.equalsIgnoreCase(attrName)) {
target = attr.getNodeValue();
} else if ("rel".equalsIgnoreCase(attrName) &&
"nofollow".equalsIgnoreCase(attr.getNodeValue()))
{
noFollow = true;
} else if ("method".equalsIgnoreCase(attrName) &&
"post".equalsIgnoreCase(attr.getNodeValue())) {
post = true;
}
}
if (target != null && !noFollow && !post)
try {
URL url = new URL(base, target);
outlinks.add(new Outlink(url.toString(),
linkText.toString().trim(),
conf));
} catch (MalformedURLException e) {
// don't care
}
}
// this should not have any children, skip them
if (params.childLen == 0) return;
}
}
for ( int i = 0; i < childLen; i++ ) {
getOutlinks(base, outlinks, children.item(i));
}
}
}
-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Nutch-general mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-general