Author: cutting
Date: Tue Jun  7 14:50:51 2005
New Revision: 189452

URL: http://svn.apache.org/viewcvs?rev=189452&view=rev
Log:
Normalize & filter linked URLs prior to adding them.

Modified:
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java?rev=189452&r1=189451&r2=189452&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java Tue Jun  7 14:50:51 2005
@@ -22,6 +22,7 @@
 import org.apache.nutch.util.*;
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.parse.*;
+import org.apache.nutch.net.*;
 
 import java.io.*;
 import java.util.*;
@@ -36,6 +37,8 @@
 
   private float interval;
 
+  private UrlNormalizer urlNormalizer = UrlNormalizerFactory.getNormalizer();
+        
   public ParseSegment() { super(null); }
 
   public ParseSegment(NutchConf conf) {
@@ -93,7 +96,7 @@
                               UTF8.class, CrawlDatum.class);
     
     return new RecordWriter() {
-        
+
         public void write(WritableComparable key, Writable value)
           throws IOException {
           
@@ -105,9 +108,17 @@
           // collect outlinks for subsequent db update
           Outlink[] links = parse.getData().getOutlinks();
           for (int i = 0; i < links.length; i++) {
-            crawlOut.append(new UTF8(links[i].getToUrl()),
-                            new CrawlDatum(CrawlDatum.STATUS_LINKED,
-                                           interval));
+            String toUrl = links[i].getToUrl();
+            try {
+              toUrl = urlNormalizer.normalize(toUrl); // normalize the url
+              toUrl = URLFilters.filter(toUrl);   // filter the url
+            } catch (Exception e) {
+              toUrl = null;
+            }
+            if (toUrl != null)
+              crawlOut.append(new UTF8(toUrl),
+                              new CrawlDatum(CrawlDatum.STATUS_LINKED,
+                                             interval));
           }
         }
         




-------------------------------------------------------
This SF.Net email is sponsored by: NEC IT Guy Games.  How far can you shotput
a projector? How fast can you ride your desk chair down the office luge track?
If you want to score the big prize, get to know the little guy.
Play to win an NEC 61" plasma display: http://www.necitguy.com/?r 
_______________________________________________
Nutch-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to