Hi guys,
The URLs of some files on the internet may contain Chinese or other
non-ASCII Unicode characters, for example:
http://www.example.com/中文.pdf
Nutch does not percent-encode such URLs before fetching them, so the
fetches fail. This patch uses java.net.URLEncoder to UTF-8-encode the
last path segment (the file name) of each URL before it is fetched.
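For illustration, here is a minimal standalone sketch of the intended
transformation (the class and method names are just for this example,
not part of the patch):

  import java.io.UnsupportedEncodingException;
  import java.net.URLEncoder;

  public class EncodeLastSegment {
    // Percent-encode everything after the last '/' as UTF-8.
    static String encodeLastSegment(String url)
        throws UnsupportedEncodingException {
      int lastSlash = url.lastIndexOf('/');
      if (lastSlash < 0 || lastSlash == url.length() - 1) {
        return url; // no trailing file name to encode
      }
      return url.substring(0, lastSlash + 1)
          + URLEncoder.encode(url.substring(lastSlash + 1), "UTF-8");
    }

    public static void main(String[] args)
        throws UnsupportedEncodingException {
      // Prints http://www.example.com/%E4%B8%AD%E6%96%87.pdf
      System.out.println(
          encodeLastSegment("http://www.example.com/中文.pdf"));
    }
  }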
罗磊
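P.S. One caveat: URLEncoder implements the
application/x-www-form-urlencoded rules, so a space in the file name
becomes '+' rather than '%20', and a query string after the last '/'
would have its '?', '=' and '&' escaped as well. If that matters, a
stricter alternative (sketched below, not part of this patch) is
java.net.URI's multi-argument constructor, which applies RFC 3986-style
percent-encoding to the path:

  import java.net.URI;
  import java.net.URISyntaxException;

  public class Rfc3986Encode {
    public static void main(String[] args) throws URISyntaxException {
      // The multi-argument constructor quotes illegal characters such
      // as spaces; toASCIIString() also escapes the non-ASCII ones.
      URI uri = new URI("http", "www.example.com", "/中文 报告.pdf", null);
      // Prints
      // http://www.example.com/%E4%B8%AD%E6%96%87%20%E6%8A%A5%E5%91%8A.pdf
      System.out.println(uri.toASCIIString());
    }
  }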
--- /home/luolei/workspace/apache-nutch-1.1/src/java/org/apache/nutch/fetcher/Fetcher.java 2010-03-30 16:35:49.000000000 +0800
+++ src/java/org/apache/nutch/fetcher/Fetcher.java 2011-01-10 22:21:39.959000051 +0800
@@ -50,6 +50,9 @@
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.*;
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
+
/**
* A queue-based fetcher.
@@ -580,16 +583,44 @@
return;
}
}
+
+          // Percent-encode the last path segment (the file name) as
+          // UTF-8 so that URLs containing Chinese or other non-ASCII
+          // characters can be fetched correctly.
+          String fiturl = fit.url.toString();
+          String utf8url = fiturl;
+          try {
+            int lastSlash = fiturl.lastIndexOf('/');
+            if (lastSlash >= 0 && lastSlash < fiturl.length() - 1) {
+              utf8url = fiturl.substring(0, lastSlash + 1)
+                + URLEncoder.encode(fiturl.substring(lastSlash + 1), "UTF-8");
+            }
+          } catch (UnsupportedEncodingException e) {
+            LOG.warn("Cannot UTF-8-encode " + fiturl + ", using it as-is");
+          }
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("URL before encoding: " + fiturl + ", after: " + utf8url);
+          }
+          Text urlText = new Text(utf8url);
+
lastRequestStart.set(System.currentTimeMillis());
Text reprUrlWritable =
(Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
if (reprUrlWritable == null) {
- reprUrl = fit.url.toString();
+ reprUrl = urlText.toString();
} else {
reprUrl = reprUrlWritable.toString();
}
try {
- if (LOG.isInfoEnabled()) { LOG.info("fetching " + fit.url); }
+ if (LOG.isInfoEnabled()) { LOG.info("fetching " + urlText); }
// fetch the page
redirecting = false;
@@ -599,15 +630,15 @@
LOG.debug("redirectCount=" + redirectCount);
}
redirecting = false;
- Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());
- RobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
+ Protocol protocol = this.protocolFactory.getProtocol(urlText.toString());
+ RobotRules rules = protocol.getRobotRules(urlText, fit.datum);
if (!rules.isAllowed(fit.u)) {
// unblock
fetchQueues.finishFetchItem(fit, true);
if (LOG.isDebugEnabled()) {
- LOG.debug("Denied by robots.txt: " + fit.url);
+ LOG.debug("Denied by robots.txt: " + urlText);
}
- output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
+ output(urlText, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
reporter.incrCounter("FetcherStatus", "robots_denied", 1);
continue;
}
@@ -615,8 +646,8 @@
if (rules.getCrawlDelay() > maxCrawlDelay) {
// unblock
fetchQueues.finishFetchItem(fit, true);
- LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
- output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
+ LOG.debug("Crawl-Delay for " + urlText + " too long (" + rules.getCrawlDelay() + "), skipping");
+ output(urlText, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1);
continue;
} else {
@@ -624,14 +655,14 @@
fiq.crawlDelay = rules.getCrawlDelay();
}
}
- ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum);
+ ProtocolOutput output = protocol.getProtocolOutput(urlText, fit.datum);
ProtocolStatus status = output.getStatus();
Content content = output.getContent();
ParseStatus pstatus = null;
// unblock queue
fetchQueues.finishFetchItem(fit);
- String urlString = fit.url.toString();
+ String urlString = urlText.toString();
reporter.incrCounter("FetcherStatus", status.getName(), 1);
@@ -643,14 +674,14 @@
break;
case ProtocolStatus.SUCCESS: // got a page
- pstatus = output(fit.url, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS);
+ pstatus = output(urlText, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS);
updateStatus(content.getContent().length);
if (pstatus != null && pstatus.isSuccess() &&
pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
String newUrl = pstatus.getMessage();
int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
Text redirUrl =
- handleRedirect(fit.url, fit.datum,
+ handleRedirect(urlText, fit.datum,
urlString, newUrl,
refreshTime < Fetcher.PERM_REFRESH_TIME,
Fetcher.CONTENT_REDIR);
@@ -689,10 +720,10 @@
code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
temp = true;
}
- output(fit.url, fit.datum, content, status, code);
+ output(urlText, fit.datum, content, status, code);
String newUrl = status.getMessage();
Text redirUrl =
- handleRedirect(fit.url, fit.datum,
+ handleRedirect(urlText, fit.datum,
urlString, newUrl, temp,
Fetcher.PROTOCOL_REDIR);
if (redirUrl != null) {
@@ -722,40 +753,40 @@
break;
case ProtocolStatus.EXCEPTION:
- logError(fit.url, status.getMessage());
+ logError(urlText, status.getMessage());
int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID());
if (killedURLs!=0)
reporter.incrCounter("FetcherStatus", "AboveExceptionThresholdInQueue", killedURLs);
/* FALLTHROUGH */
case ProtocolStatus.RETRY: // retry
case ProtocolStatus.BLOCKED:
- output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
+ output(urlText, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
break;
case ProtocolStatus.GONE: // gone
case ProtocolStatus.NOTFOUND:
case ProtocolStatus.ACCESS_DENIED:
case ProtocolStatus.ROBOTS_DENIED:
- output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
+ output(urlText, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
break;
case ProtocolStatus.NOTMODIFIED:
- output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
+ output(urlText, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
break;
default:
if (LOG.isWarnEnabled()) {
LOG.warn("Unknown ProtocolStatus: " + status.getCode());
}
- output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
+ output(urlText, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
}
if (redirecting && redirectCount >= maxRedirect) {
fetchQueues.finishFetchItem(fit);
if (LOG.isInfoEnabled()) {
- LOG.info(" - redirect count exceeded " + fit.url);
+ LOG.info(" - redirect count exceeded " + urlText);
}
- output(fit.url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, CrawlDatum.STATUS_FETCH_GONE);
+ output(urlText, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, CrawlDatum.STATUS_FETCH_GONE);
}
} while (redirecting && (redirectCount < maxRedirect));
@@ -763,8 +794,8 @@
} catch (Throwable t) { // unexpected exception
// unblock
fetchQueues.finishFetchItem(fit);
- logError(fit.url, t.toString());
- output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY);
+ logError(urlText, t.toString());
+ output(urlText, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY);
}
}