Author: ab
Date: Mon May 16 07:23:53 2005
New Revision: 170390
URL: http://svn.apache.org/viewcvs?rev=170390&view=rev
Log:
Extract links from other HTML elements, too.
Modified:
incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
Modified:
incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL:
http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=170390&r1=170389&r2=170390&view=diff
==============================================================================
---
incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
(original)
+++
incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
Mon May 16 07:23:53 2005
@@ -55,8 +55,12 @@
static {
linkParams.put("a", new LinkParams("a", "href", 1));
linkParams.put("area", new LinkParams("area", "href", 0));
+ linkParams.put("form", new LinkParams("form", "action", 1));
linkParams.put("frame", new LinkParams("frame", "src", 0));
linkParams.put("iframe", new LinkParams("iframe", "src", 0));
+ linkParams.put("script", new LinkParams("script", "src", 0));
+ linkParams.put("link", new LinkParams("link", "href", 0));
+ linkParams.put("img", new LinkParams("img", "src", 0));
}
/**
@@ -72,8 +76,6 @@
*
* <p>
*
- * Currently, only SCRIPT, STYLE and comment text are ignored.
- *
* @return true if nested anchors were found
*/
public static final boolean getText(StringBuffer sb, Node node,
@@ -100,9 +102,25 @@
boolean abortOnNestedAnchors,
int anchorDepth) {
if ("script".equalsIgnoreCase(node.getNodeName())) {
+ Node n = node.getAttributes().getNamedItem("language");
+ if (n != null) {
+ String text = n.getNodeValue();
+ sb.append(text);
+ }
return false;
}
if ("style".equalsIgnoreCase(node.getNodeName())) {
+ Node n = node.getAttributes().getNamedItem("rel");
+ if (n != null) {
+ String text = n.getNodeValue();
+ sb.append(text);
+ }
+ n = node.getAttributes().getNamedItem("type");
+ if (n != null) {
+ String text = n.getNodeValue();
+ if (sb.length() > 0) sb.append(", ");
+ sb.append(text);
+ }
return false;
}
if (abortOnNestedAnchors && "a".equalsIgnoreCase(node.getNodeName())) {
-------------------------------------------------------
This SF.Net email is sponsored by Oracle Space Sweepstakes
Want to be the first software developer in space?
Enter now for the Oracle Space Sweepstakes!
http://ads.osdn.com/?ad_id=7412&alloc_id=16344&op=click
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs