Author: ferdy
Date: Fri Aug 31 13:02:32 2012
New Revision: 1379438
URL: http://svn.apache.org/viewvc?rev=1379438&view=rev
Log:
NUTCH-1448 Redirected urls should be handled more cleanly (more like an outlink
url)
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1379438&r1=1379437&r2=1379438&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Aug 31 13:02:32 2012
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.1 - Current Development
+* NUTCH-1448 Redirected urls should be handled more cleanly (more like an outlink url) (ferdy)
+
* NUTCH-1463 Elasticsearch indexer should wait and check response for last flush (ferdy)
* NUTCH-1462 Elasticsearch not indexing when type==null in NutchDocument metadata (ferdy)
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1379438&r1=1379437&r2=1379438&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Fri Aug 31 13:02:32 2012
@@ -225,15 +225,6 @@
</property>
<property>
- <name>http.redirect.max</name>
- <value>0</value>
- <description>The maximum number of redirects the fetcher will follow when
- trying to fetch a page. If set to negative or 0, fetcher won't immediately
- follow redirected URLs, instead it will record them for later fetching.
- </description>
-</property>
-
-<property>
<name>http.useHttp11</name>
<value>false</value>
<description>NOTE: at the moment this works only for protocol-httpclient.
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java?rev=1379438&r1=1379437&r2=1379438&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java Fri Aug 31 13:02:32 2012
@@ -105,58 +105,45 @@ extends GoraReducer<UrlWithScore, NutchW
page.setScore(0.0f);
}
} else {
- if (page.getMetadata().containsKey(FetcherJob.REDIRECT_DISCOVERED)
- && !page.isReadable(WebPage.Field.STATUS.getIndex())) {
- // this row is marked during fetch as the destination of a redirect
- // but does not contain anything else, so we initialize it.
- page.setStatus(CrawlStatus.STATUS_UNFETCHED);
- schedule.initializeSchedule(url, page);
- try {
- scoringFilters.initialScore(url, page);
- } catch (ScoringFilterException e) {
- page.setScore(0.0f);
+ byte status = (byte)page.getStatus();
+ switch (status) {
+ case CrawlStatus.STATUS_FETCHED: // succesful fetch
+ case CrawlStatus.STATUS_REDIR_TEMP: // successful fetch, redirected
+ case CrawlStatus.STATUS_REDIR_PERM:
+ case CrawlStatus.STATUS_NOTMODIFIED: // successful fetch, notmodified
+ int modified = FetchSchedule.STATUS_UNKNOWN;
+ if (status == CrawlStatus.STATUS_NOTMODIFIED) {
+ modified = FetchSchedule.STATUS_NOTMODIFIED;
}
- } else { // update row
- byte status = (byte)page.getStatus();
- switch (status) {
- case CrawlStatus.STATUS_FETCHED: // succesful fetch
- case CrawlStatus.STATUS_REDIR_TEMP: // successful fetch, redirected
- case CrawlStatus.STATUS_REDIR_PERM:
- case CrawlStatus.STATUS_NOTMODIFIED: // successful fetch, notmodified
- int modified = FetchSchedule.STATUS_UNKNOWN;
- if (status == CrawlStatus.STATUS_NOTMODIFIED) {
- modified = FetchSchedule.STATUS_NOTMODIFIED;
- }
- ByteBuffer prevSig = page.getPrevSignature();
- ByteBuffer signature = page.getSignature();
- if (prevSig != null && signature != null) {
- if (SignatureComparator.compare(prevSig.array(), signature.array()) != 0) {
- modified = FetchSchedule.STATUS_MODIFIED;
- } else {
- modified = FetchSchedule.STATUS_NOTMODIFIED;
- }
- }
- long fetchTime = page.getFetchTime();
- long prevFetchTime = page.getPrevFetchTime();
- long modifiedTime = page.getModifiedTime();
-
- schedule.setFetchSchedule(url, page, prevFetchTime, 0L,
- fetchTime, modifiedTime, modified);
- if (maxInterval < page.getFetchInterval())
- schedule.forceRefetch(url, page, false);
- break;
- case CrawlStatus.STATUS_RETRY:
- schedule.setPageRetrySchedule(url, page, 0L, 0L, page.getFetchTime());
- if (page.getRetriesSinceFetch() < retryMax) {
- page.setStatus(CrawlStatus.STATUS_UNFETCHED);
+ ByteBuffer prevSig = page.getPrevSignature();
+ ByteBuffer signature = page.getSignature();
+ if (prevSig != null && signature != null) {
+ if (SignatureComparator.compare(prevSig.array(), signature.array()) != 0) {
+ modified = FetchSchedule.STATUS_MODIFIED;
} else {
- page.setStatus(CrawlStatus.STATUS_GONE);
+ modified = FetchSchedule.STATUS_NOTMODIFIED;
}
- break;
- case CrawlStatus.STATUS_GONE:
- schedule.setPageGoneSchedule(url, page, 0L, 0L, page.getFetchTime());
- break;
}
+ long fetchTime = page.getFetchTime();
+ long prevFetchTime = page.getPrevFetchTime();
+ long modifiedTime = page.getModifiedTime();
+
+ schedule.setFetchSchedule(url, page, prevFetchTime, 0L,
+ fetchTime, modifiedTime, modified);
+ if (maxInterval < page.getFetchInterval())
+ schedule.forceRefetch(url, page, false);
+ break;
+ case CrawlStatus.STATUS_RETRY:
+ schedule.setPageRetrySchedule(url, page, 0L, 0L, page.getFetchTime());
+ if (page.getRetriesSinceFetch() < retryMax) {
+ page.setStatus(CrawlStatus.STATUS_UNFETCHED);
+ } else {
+ page.setStatus(CrawlStatus.STATUS_GONE);
+ }
+ break;
+ case CrawlStatus.STATUS_GONE:
+ schedule.setPageGoneSchedule(url, page, 0L, 0L, page.getFetchTime());
+ break;
}
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1379438&r1=1379437&r2=1379438&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java Fri Aug 31 13:02:32 2012
@@ -431,10 +431,8 @@ extends GoraReducer<IntWritable, FetchEn
private final long maxCrawlDelay;
@SuppressWarnings("unused")
private final boolean byIP;
- private final int maxRedirect;
private String reprUrl;
- private boolean redirecting;
- private int redirectCount;
+
private final Context context;
public FetcherThread(Context context, int num) {
@@ -448,7 +446,6 @@ extends GoraReducer<IntWritable, FetchEn
this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
// backward-compatible default setting
this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
- this.maxRedirect = conf.getInt("http.redirect.max", 3);
}
@Override
@@ -487,110 +484,93 @@ extends GoraReducer<IntWritable, FetchEn
LOG.info("fetching " + fit.url);
// fetch the page
- redirecting = false;
- redirectCount = 0;
- do {
+ final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
+ final RobotRules rules = protocol.getRobotRules(fit.url, fit.page);
+ if (!rules.isAllowed(fit.u)) {
+ // unblock
+ fetchQueues.finishFetchItem(fit, true);
if (LOG.isDebugEnabled()) {
- LOG.debug("redirectCount=" + redirectCount);
+ LOG.debug("Denied by robots.txt: " + fit.url);
}
- redirecting = false;
- final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
- final RobotRules rules = protocol.getRobotRules(fit.url, fit.page);
- if (!rules.isAllowed(fit.u)) {
+ output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED,
+ CrawlStatus.STATUS_GONE);
+ continue;
+ }
+ if (rules.getCrawlDelay() > 0) {
+ if (rules.getCrawlDelay() > maxCrawlDelay) {
// unblock
fetchQueues.finishFetchItem(fit, true);
- if (LOG.isDebugEnabled()) {
- LOG.debug("Denied by robots.txt: " + fit.url);
- }
- output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED,
- CrawlStatus.STATUS_GONE);
+ LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
+ output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED, CrawlStatus.STATUS_GONE);
continue;
+ } else {
+ final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
+ fiq.crawlDelay = rules.getCrawlDelay();
}
- if (rules.getCrawlDelay() > 0) {
- if (rules.getCrawlDelay() > maxCrawlDelay) {
- // unblock
- fetchQueues.finishFetchItem(fit, true);
- LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
- output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED, CrawlStatus.STATUS_GONE);
- continue;
- } else {
- final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
- fiq.crawlDelay = rules.getCrawlDelay();
- }
- }
- final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
- final ProtocolStatus status = output.getStatus();
- final Content content = output.getContent();
- // unblock queue
- fetchQueues.finishFetchItem(fit);
-
- context.getCounter("FetcherStatus", ProtocolStatusUtils.getName(status.getCode())).increment(1);
-
- int length = 0;
- if (content!=null && content.getContent()!=null) length= content.getContent().length;
- updateStatus(length);
-
- switch(status.getCode()) {
-
- case ProtocolStatusCodes.WOULDBLOCK:
- // retry ?
- fetchQueues.addFetchItem(fit);
- break;
-
- case ProtocolStatusCodes.SUCCESS: // got a page
- output(fit, content, status, CrawlStatus.STATUS_FETCHED);
- break;
-
- case ProtocolStatusCodes.MOVED: // redirect
- case ProtocolStatusCodes.TEMP_MOVED:
- byte code;
- boolean temp;
- if (status.getCode() == ProtocolStatusCodes.MOVED) {
- code = CrawlStatus.STATUS_REDIR_PERM;
- temp = false;
- } else {
- code = CrawlStatus.STATUS_REDIR_TEMP;
- temp = true;
- }
- output(fit, content, status, code);
- final String newUrl = ProtocolStatusUtils.getMessage(status);
- handleRedirect(fit.url, newUrl, temp, FetcherJob.PROTOCOL_REDIR);
- redirecting = false;
- break;
- case ProtocolStatusCodes.EXCEPTION:
- logError(fit.url, ProtocolStatusUtils.getMessage(status));
- /* FALLTHROUGH */
- case ProtocolStatusCodes.RETRY: // retry
- case ProtocolStatusCodes.BLOCKED:
- output(fit, null, status, CrawlStatus.STATUS_RETRY);
- break;
-
- case ProtocolStatusCodes.GONE: // gone
- case ProtocolStatusCodes.NOTFOUND:
- case ProtocolStatusCodes.ACCESS_DENIED:
- case ProtocolStatusCodes.ROBOTS_DENIED:
- output(fit, null, status, CrawlStatus.STATUS_GONE);
- break;
-
- case ProtocolStatusCodes.NOTMODIFIED:
- output(fit, null, status, CrawlStatus.STATUS_NOTMODIFIED);
- break;
-
- default:
- if (LOG.isWarnEnabled()) {
- LOG.warn("Unknown ProtocolStatus: " + status.getCode());
- }
- output(fit, null, status, CrawlStatus.STATUS_RETRY);
- }
+ }
+ final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
+ final ProtocolStatus status = output.getStatus();
+ final Content content = output.getContent();
+ // unblock queue
+ fetchQueues.finishFetchItem(fit);
- if (redirecting && redirectCount > maxRedirect) {
- fetchQueues.finishFetchItem(fit);
- LOG.info(" - redirect count exceeded " + fit.url);
- output(fit, null, ProtocolStatusUtils.STATUS_REDIR_EXCEEDED,
- CrawlStatus.STATUS_GONE);
- }
+ context.getCounter("FetcherStatus", ProtocolStatusUtils.getName(status.getCode())).increment(1);
- } while (redirecting && (redirectCount <= maxRedirect));
+ int length = 0;
+ if (content!=null && content.getContent()!=null) length= content.getContent().length;
+ updateStatus(length);
+
+ switch(status.getCode()) {
+
+ case ProtocolStatusCodes.WOULDBLOCK:
+ // retry ?
+ fetchQueues.addFetchItem(fit);
+ break;
+
+ case ProtocolStatusCodes.SUCCESS: // got a page
+ output(fit, content, status, CrawlStatus.STATUS_FETCHED);
+ break;
+
+ case ProtocolStatusCodes.MOVED: // redirect
+ case ProtocolStatusCodes.TEMP_MOVED:
+ byte code;
+ boolean temp;
+ if (status.getCode() == ProtocolStatusCodes.MOVED) {
+ code = CrawlStatus.STATUS_REDIR_PERM;
+ temp = false;
+ } else {
+ code = CrawlStatus.STATUS_REDIR_TEMP;
+ temp = true;
+ }
+ final String newUrl = ProtocolStatusUtils.getMessage(status);
+ handleRedirect(fit.url, newUrl, temp, FetcherJob.PROTOCOL_REDIR, fit.page);
+ output(fit, content, status, code);
+ break;
+ case ProtocolStatusCodes.EXCEPTION:
+ logFetchFailure(fit.url, ProtocolStatusUtils.getMessage(status));
+ /* FALLTHROUGH */
+ case ProtocolStatusCodes.RETRY: // retry
+ case ProtocolStatusCodes.BLOCKED:
+ output(fit, null, status, CrawlStatus.STATUS_RETRY);
+ break;
+
+ case ProtocolStatusCodes.GONE: // gone
+ case ProtocolStatusCodes.NOTFOUND:
+ case ProtocolStatusCodes.ACCESS_DENIED:
+ case ProtocolStatusCodes.ROBOTS_DENIED:
+ output(fit, null, status, CrawlStatus.STATUS_GONE);
+ break;
+
+ case ProtocolStatusCodes.NOTMODIFIED:
+ output(fit, null, status, CrawlStatus.STATUS_NOTMODIFIED);
+ break;
+
+ default:
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Unknown ProtocolStatus: " + status.getCode());
+ }
+ output(fit, null, status, CrawlStatus.STATUS_RETRY);
+ }
} catch (final Throwable t) { // unexpected exception
// unblock
@@ -611,27 +591,27 @@ extends GoraReducer<IntWritable, FetchEn
}
private void handleRedirect(String url, String newUrl,
- boolean temp, String redirType)
+ boolean temp, String redirType, WebPage page)
throws URLFilterException, IOException, InterruptedException {
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
newUrl = urlFilters.filter(newUrl);
if (newUrl == null || newUrl.equals(url)) {
return;
}
+ page.putToOutlinks(new Utf8(newUrl), new Utf8());
+ page.putToMetadata(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
- final String reversedNewUrl = TableUtil.reverseUrl(newUrl);
- WebPage newWebPage = new WebPage();
- if (!reprUrl.equals(url)) {
- newWebPage.setReprUrl(new Utf8(reprUrl));
- }
- newWebPage.putToMetadata(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
- context.write(reversedNewUrl, newWebPage);
- if (LOG.isDebugEnabled()) {
- LOG.debug(" - " + redirType + " redirect to " +
- reprUrl + " (fetching later)");
+ if (reprUrl == null) {
+ LOG.warn("reprUrl==null");
+ } else {
+ page.setReprUrl(new Utf8(reprUrl));
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(" - " + redirType + " redirect to " +
+ reprUrl + " (fetching later)");
+ }
}
-
}
+
private void updateStatus(int bytesInPage) throws IOException {
pages.incrementAndGet();
@@ -659,11 +639,7 @@ extends GoraReducer<IntWritable, FetchEn
if (parse) {
if (!skipTruncated || (skipTruncated &&
!ParserJob.isTruncated(fit.url, fit.page))) {
- URLWebPage redirectedPage = parseUtil.process(key, fit.page);
- if (redirectedPage != null) {
- context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
- redirectedPage.getDatum());
- }
+ parseUtil.process(key, fit.page);
}
}
//remove content if storingContent is false. Content is added to fit.page above
@@ -674,12 +650,13 @@ extends GoraReducer<IntWritable, FetchEn
context.write(key, fit.page);
}
- private void logError(String url, String message) {
- LOG.info("fetch of " + url + " failed with: " + message);
+ private void logFetchFailure(String url, String message) {
+ LOG.warn("fetch of " + url + " failed with: " + message);
errors.incrementAndGet();
}
}
+
/**
* This class feeds the queues with input items, and re-fills them as
* items are consumed by FetcherThread-s.
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1379438&r1=1379437&r2=1379438&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Fri Aug 31 13:02:32 2012
@@ -156,21 +156,19 @@ public class ParseUtil extends Configure
}
/**
- * Parses given web page and stores parsed content within page. Returns
- * a pair of <String, WebPage> if a meta-redirect is discovered
+ * Parses given web page and stores parsed content within page. Puts
+ * a meta-redirect to outlinks.
* @param key
* @param page
- * @return newly-discovered webpage (via a meta-redirect)
*/
- public URLWebPage process(String key, WebPage page) {
- URLWebPage redirectedPage = null;
+ public void process(String key, WebPage page) {
String url = TableUtil.unreverseUrl(key);
byte status = (byte) page.getStatus();
if (status != CrawlStatus.STATUS_FETCHED) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping " + url + " as status is: " +
CrawlStatus.getName(status));
}
- return redirectedPage;
+ return;
}
Parse parse;
@@ -179,14 +177,15 @@ public class ParseUtil extends Configure
} catch (ParserNotFound e) {
// do not print stacktrace for the fact that some types are not mapped.
LOG.warn("No suitable parser found: " + e.getMessage());
- return redirectedPage;
+ return;
} catch (final Exception e) {
- LOG.warn("Error parsing: " + url + ": " + StringUtils.stringifyException(e));
- return redirectedPage;
+ LOG.warn("Error parsing: " + url + ": "
+ + StringUtils.stringifyException(e));
+ return;
}
if (parse == null) {
- return redirectedPage;
+ return;
}
final byte[] signature = sig.calculate(page);
@@ -199,24 +198,34 @@ public class ParseUtil extends Configure
int refreshTime = Integer.parseInt(ParseStatusUtils.getArg(pstatus,
1));
try {
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
- newUrl = filters.filter(newUrl);
- } catch (URLFilterException e) {
- return redirectedPage; // TODO: is this correct
+ if (newUrl == null) {
+ LOG.warn("redirect normalized to null " + url);
+ return;
+ }
+ try {
+ newUrl = filters.filter(newUrl);
+ } catch (URLFilterException e) {
+ return;
+ }
+ if (newUrl == null) {
+ LOG.warn("redirect filtered to null " + url);
+ return;
+ }
} catch (MalformedURLException e) {
- return redirectedPage;
+ LOG.warn("malformed url exception parsing redirect " + url);
+ return;
}
+ page.putToOutlinks(new Utf8(newUrl), new Utf8());
+ page.putToMetadata(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
if (newUrl == null || newUrl.equals(url)) {
String reprUrl = URLUtil.chooseRepr(url, newUrl,
refreshTime < FetcherJob.PERM_REFRESH_TIME);
- WebPage newWebPage = new WebPage();
if (reprUrl == null) {
LOG.warn("reprUrl==null for " + url);
- return redirectedPage;
+ return;
} else {
page.setReprUrl(new Utf8(reprUrl));
}
- page.putToMetadata(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
- redirectedPage = new URLWebPage(reprUrl, newWebPage);
}
} else {
page.setText(new Utf8(parse.getText()));
@@ -246,15 +255,19 @@ public class ParseUtil extends Configure
try {
toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
toUrl = filters.filter(toUrl);
- } catch (final URLFilterException e) {
+ } catch (MalformedURLException e2) {
continue;
- }
- catch (MalformedURLException e2){
+ } catch (URLFilterException e) {
continue;
}
if (toUrl == null) {
continue;
}
+ Utf8 utf8ToUrl = new Utf8(toUrl);
+ if (page.getFromOutlinks(utf8ToUrl) != null) {
+ // skip duplicate outlinks
+ continue;
+ }
String toHost;
if (ignoreExternalLinks) {
try {
@@ -267,7 +280,7 @@ public class ParseUtil extends Configure
}
}
- page.putToOutlinks(new Utf8(toUrl), new Utf8(outlinks[i].getAnchor()));
+ page.putToOutlinks(utf8ToUrl, new Utf8(outlinks[i].getAnchor()));
}
Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page);
if (fetchMark != null) {
@@ -275,6 +288,5 @@ public class ParseUtil extends Configure
}
}
}
- return redirectedPage;
}
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java?rev=1379438&r1=1379437&r2=1379438&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java Fri Aug 31 13:02:32 2012
@@ -126,17 +126,13 @@ public class ParserJob extends NutchTool
}
- URLWebPage redirectedPage = parseUtil.process(key, page);
+ parseUtil.process(key, page);
ParseStatus pstatus = page.getParseStatus();
if (pstatus != null) {
context.getCounter("ParserStatus",
ParseStatusCodes.majorCodes[pstatus.getMajorCode()]).increment(1);
}
- if (redirectedPage != null) {
- context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
- redirectedPage.getDatum());
- }
context.write(key, page);
}
}