Author: ab
Date: Mon May 16 07:23:53 2005
New Revision: 170390

URL: http://svn.apache.org/viewcvs?rev=170390&view=rev
Log:
Extract links from other HTML elements, too.

Modified:
    
incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java

Modified: 
incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: 
http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=170390&r1=170389&r2=170390&view=diff
==============================================================================
--- 
incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
 (original)
+++ 
incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
 Mon May 16 07:23:53 2005
@@ -55,8 +55,12 @@
   static {
       linkParams.put("a", new LinkParams("a", "href", 1));
       linkParams.put("area", new LinkParams("area", "href", 0));
+      linkParams.put("form", new LinkParams("form", "action", 1));
       linkParams.put("frame", new LinkParams("frame", "src", 0));
       linkParams.put("iframe", new LinkParams("iframe", "src", 0));
+      linkParams.put("script", new LinkParams("script", "src", 0));
+      linkParams.put("link", new LinkParams("link", "href", 0));
+      linkParams.put("img", new LinkParams("img", "src", 0));
   }
   
   /**
@@ -72,8 +76,6 @@
    * 
    * <p>
    *
-   * Currently, only SCRIPT, STYLE and comment text are ignored.
-   *
    * @return true if nested anchors were found
    */
   public static final boolean getText(StringBuffer sb, Node node, 
@@ -100,9 +102,25 @@
                                              boolean abortOnNestedAnchors,
                                              int anchorDepth) {
     if ("script".equalsIgnoreCase(node.getNodeName())) {
+      Node n = node.getAttributes().getNamedItem("language");
+      if (n != null) {
+        String text = n.getNodeValue();
+        sb.append(text);
+      }
       return false;
     }
     if ("style".equalsIgnoreCase(node.getNodeName())) {
+      Node n = node.getAttributes().getNamedItem("rel");
+      if (n != null) {
+        String text = n.getNodeValue();
+        sb.append(text);
+      }
+      n = node.getAttributes().getNamedItem("type");
+      if (n != null) {
+        String text = n.getNodeValue();
+        if (sb.length() > 0) sb.append(", ");
+        sb.append(text);
+      }
       return false;
     }
     if (abortOnNestedAnchors && "a".equalsIgnoreCase(node.getNodeName())) {




-------------------------------------------------------
This SF.Net email is sponsored by Oracle Space Sweepstakes
Want to be the first software developer in space?
Enter now for the Oracle Space Sweepstakes!
http://ads.osdn.com/?ad_id=7412&alloc_id=16344&op=click
_______________________________________________
Nutch-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to