Doug Cutting wrote:
Hi,
Here's an updated patch, which cleans up the logic and adds some extensibility. Currently it supports the references in the following HTML elements: <a>, <area>, <frame> and <iframe>. It's easy to add more elements, but I couldn't think of any others...
Andrzej.
I just applied this patch. Thanks!
Doug
Andrzej Bialecki wrote:
Hi,
The attached patch makes a world of difference in my case. I'm trying to index some graphics-rich websites, and many links are hidden within image maps. This patch allows us to collect and traverse links contained in <area href="..."> elements.
Enjoy!
-------------------------------------------------------
This SF.Net email is sponsored by: Oracle 10g
Get certified on the hottest thing ever to hit the market... Oracle 10g. Take an Oracle 10g class now, and we'll give you the exam FREE.
http://ads.osdn.com/?ad_id=3149&alloc_id=8166&op=click
_______________________________________________
Nutch-developers mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-developers
-- Best regards, Andrzej Bialecki
------------------------------------------------- Software Architect, System Integration Specialist CEN/ISSS EC Workshop, ECIMF project chair EU FP6 E-Commerce Expert/Evaluator ------------------------------------------------- FreeBSD developer (http://www.freebsd.org)
Index: DOMContentUtils.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/util/DOMContentUtils.java,v
retrieving revision 1.4
diff -b -d -u -r1.4 DOMContentUtils.java
--- DOMContentUtils.java 20 May 2004 19:19:29 -0000 1.4
+++ DOMContentUtils.java 21 May 2004 22:02:47 -0000
@@ -3,6 +3,7 @@
import java.net.URL;
import java.net.MalformedURLException;
import java.util.ArrayList;
+import java.util.HashMap;
import net.nutch.fetcher.Outlink;
@@ -17,6 +18,25 @@
*/
public class DOMContentUtils {
+ public static class LinkParams {
+ public String attrName;
+ public int childLen;
+
+ public LinkParams(String attrName, int childLen) {
+ this.attrName = attrName;
+ this.childLen = childLen;
+ }
+ }
+
+ public static HashMap linkParams = new HashMap();
+
+ static {
+ linkParams.put("a", new LinkParams("href", 1));
+ linkParams.put("area", new LinkParams("href", 0));
+ linkParams.put("frame", new LinkParams("src", 0));
+ linkParams.put("iframe", new LinkParams("src", 0));
+ }
+
/**
* This method takes a {@link StringBuffer} and a DOM {@link Node},
* and will append all the content text found beneath the DOM node to
@@ -128,13 +148,11 @@
// this only covers a few cases of empty links that are symptomatic
// of nekohtml's DOM-fixup process...
private static boolean shouldThrowAwayLink(Node node, NodeList children,
- int childLen) {
- if (node.getNodeName().equalsIgnoreCase("area")) {
- return false;
- }
+ int childLen, LinkParams params) {
if (childLen == 0) {
// this has no inner structure
- return true;
+ if (params.childLen == 0) return false;
+ else return true;
} else if ((childLen == 1)
&& (children.item(0).getNodeType() == Node.ELEMENT_NODE)
&& ("a".equalsIgnoreCase(children.item(0).getNodeName()))) {
@@ -204,10 +222,10 @@
childLen= children.getLength();
if (node.getNodeType() == Node.ELEMENT_NODE) {
- if ("a".equalsIgnoreCase(node.getNodeName()) ||
- "area".equalsIgnoreCase(node.getNodeName())) {
+ LinkParams params =
(LinkParams)linkParams.get(node.getNodeName().toLowerCase());
+ if (params != null) {
- if (shouldThrowAwayLink(node, children, childLen)) {
+ if (shouldThrowAwayLink(node, children, childLen, params)) {
// this has no inner structure or just a single nested
// anchor-- toss it!
} else {
@@ -218,7 +236,7 @@
NamedNodeMap attrs = node.getAttributes();
String target= null;
for (int i= 0; i < attrs.getLength(); i++ ) {
- if ("href".equalsIgnoreCase(attrs.item(i).getNodeName())) {
+ if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) {
target= attrs.item(i).getNodeValue();
break;
}
