Author: cutting
Date: Tue Jun 7 14:50:51 2005
New Revision: 189452
URL: http://svn.apache.org/viewcvs?rev=189452&view=rev
Log:
Normalize & filter linked URLs prior to adding them.
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java?rev=189452&r1=189451&r2=189452&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java Tue Jun 7 14:50:51 2005
@@ -22,6 +22,7 @@
import org.apache.nutch.util.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
+import org.apache.nutch.net.*;
import java.io.*;
import java.util.*;
@@ -36,6 +37,8 @@
private float interval;
+ private UrlNormalizer urlNormalizer = UrlNormalizerFactory.getNormalizer();
+
public ParseSegment() { super(null); }
public ParseSegment(NutchConf conf) {
@@ -93,7 +96,7 @@
UTF8.class, CrawlDatum.class);
return new RecordWriter() {
-
+
public void write(WritableComparable key, Writable value)
throws IOException {
@@ -105,9 +108,17 @@
// collect outlinks for subsequent db update
Outlink[] links = parse.getData().getOutlinks();
for (int i = 0; i < links.length; i++) {
- crawlOut.append(new UTF8(links[i].getToUrl()),
- new CrawlDatum(CrawlDatum.STATUS_LINKED,
- interval));
+ String toUrl = links[i].getToUrl();
+ try {
+ toUrl = urlNormalizer.normalize(toUrl); // normalize the url
+ toUrl = URLFilters.filter(toUrl); // filter the url
+ } catch (Exception e) {
+ toUrl = null;
+ }
+ if (toUrl != null)
+ crawlOut.append(new UTF8(toUrl),
+ new CrawlDatum(CrawlDatum.STATUS_LINKED,
+ interval));
}
}
-------------------------------------------------------
This SF.Net email is sponsored by: NEC IT Guy Games. How far can you shotput
a projector? How fast can you ride your desk chair down the office luge track?
If you want to score the big prize, get to know the little guy.
Play to win an NEC 61" plasma display: http://www.necitguy.com/?r
_______________________________________________
Nutch-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs