Author: jnioche
Date: Tue Jul 15 11:32:32 2014
New Revision: 1610659
URL: http://svn.apache.org/r1610659
Log:
NUTCH-926 Redirections from META tag don't get filtered
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1610659&r1=1610658&r2=1610659&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jul 15 11:32:32 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-926 Redirections from META tag don't get filtered (snagel)
+
* NUTCH-1422 Bypass signature comparison when a document is redirected (snagel)
* NUTCH-1502 Test for CrawlDatum state transitions (snagel)
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=1610659&r1=1610658&r2=1610659&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Tue Jul 15 11:32:32 2014
@@ -165,56 +165,42 @@ public class ParseOutputFormat implement
}
if (parseMDCrawlDatum != null) crawlOut.append(key, parseMDCrawlDatum);
+ if (ignoreExternalLinks) {
+ // need to determine fromHost (once for all outlinks)
try {
- ParseStatus pstatus = parseData.getStatus();
- if (pstatus != null && pstatus.isSuccess() &&
- pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
- String newUrl = pstatus.getMessage();
- int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
-
- try {
- if(normalizers != null) {
- newUrl = normalizers.normalize(newUrl,
- URLNormalizers.SCOPE_FETCHER);
- }
- } catch (MalformedURLException mfue) {
- newUrl = null;
- }
-
- if (filters != null) {
- if (newUrl != null) newUrl = filters.filter(newUrl);
- }
+ fromHost = new URL(fromUrl).getHost().toLowerCase();
+ } catch (MalformedURLException e) {
+ fromHost = null;
+ }
+ } else {
+ fromHost = null;
+ }
- String url = key.toString();
- if (newUrl != null && !newUrl.equals(url)) {
- String reprUrl =
- URLUtil.chooseRepr(url, newUrl,
- refreshTime < Fetcher.PERM_REFRESH_TIME);
- CrawlDatum newDatum = new CrawlDatum();
- newDatum.setStatus(CrawlDatum.STATUS_LINKED);
- if (reprUrl != null && !reprUrl.equals(newUrl)) {
- newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
- new Text(reprUrl));
- }
- crawlOut.append(new Text(newUrl), newDatum);
- }
+ ParseStatus pstatus = parseData.getStatus();
+ if (pstatus != null && pstatus.isSuccess()
+ && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
+ String newUrl = pstatus.getMessage();
+ int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
+ newUrl = filterNormalize(fromUrl, newUrl, fromHost,
+ ignoreExternalLinks, filters, normalizers,
+ URLNormalizers.SCOPE_FETCHER);
+
+ if (newUrl != null) {
+ String reprUrl = URLUtil.chooseRepr(fromUrl, newUrl,
+ refreshTime < Fetcher.PERM_REFRESH_TIME);
+ CrawlDatum newDatum = new CrawlDatum();
+ newDatum.setStatus(CrawlDatum.STATUS_LINKED);
+ if (reprUrl != null && !reprUrl.equals(newUrl)) {
+ newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+ new Text(reprUrl));
}
- } catch (URLFilterException e) {
- // ignore
+ crawlOut.append(new Text(newUrl), newDatum);
}
+ }
// collect outlinks for subsequent db update
Outlink[] links = parseData.getOutlinks();
int outlinksToStore = Math.min(maxOutlinks, links.length);
- if (ignoreExternalLinks) {
- try {
- fromHost = new URL(fromUrl).getHost().toLowerCase();
- } catch (MalformedURLException e) {
- fromHost = null;
- }
- } else {
- fromHost = null;
- }
int validCount = 0;
CrawlDatum adjust = null;
@@ -299,7 +285,16 @@ public class ParseOutputFormat implement
}
- public static String filterNormalize(String fromUrl, String toUrl, String fromHost, boolean ignoreExternalLinks, URLFilters filters, URLNormalizers normalizers) {
+ public static String filterNormalize(String fromUrl, String toUrl,
+ String fromHost, boolean ignoreExternalLinks, URLFilters filters,
+ URLNormalizers normalizers) {
+ return filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks,
+ filters, normalizers, URLNormalizers.SCOPE_OUTLINK);
+ }
+
+ public static String filterNormalize(String fromUrl, String toUrl,
+ String fromHost, boolean ignoreExternalLinks, URLFilters filters,
+ URLNormalizers normalizers, String urlNormalizerScope) {
// ignore links to self (or anchors within the page)
if (fromUrl.equals(toUrl)) {
return null;
@@ -318,7 +313,7 @@ public class ParseOutputFormat implement
try {
if(normalizers != null) {
toUrl = normalizers.normalize(toUrl,
- URLNormalizers.SCOPE_OUTLINK); // normalize the url
+ urlNormalizerScope); // normalize the url
}
if (filters != null) {
toUrl = filters.filter(toUrl); // filter the url