Author: jnioche
Date: Fri Nov 20 16:21:46 2015
New Revision: 1715386
URL: http://svn.apache.org/viewvc?rev=1715386&view=rev
Log:
NUTCH-2069
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1715386&r1=1715385&r2=1715386&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Nov 20 16:21:46 2015
@@ -3,6 +3,8 @@ Nutch Change Log
Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy)
Release Report: http://s.apache.org/nutch11
+* NUTCH-2069 Ignore external links based on domain (jnioche)
+
* NUTCH-2173 String.join in FileDumper breaks the build (joyce)
* NUTCH-2166 Add reverse URL format to dump tool (joyce)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1715386&r1=1715385&r2=1715386&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Nov 20 16:21:46 2015
@@ -548,12 +548,19 @@
<property>
<name>db.ignore.external.links</name>
<value>false</value>
- <description>If true, outlinks leading from a page to external hosts
+ <description>If true, outlinks leading from a page to external hosts or domain
will be ignored. This is an effective way to limit the crawl to include
only initially injected hosts, without creating complex URLFilters.
+ See 'db.ignore.external.links.mode'.
</description>
</property>
+<property>
+ <name>db.ignore.external.links.mode</name>
+ <value>byHost</value>
+ <description>Alternative value is byDomain</description>
+</property>
+
<property>
<name>db.injector.overwrite</name>
<value>false</value>
Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java?rev=1715386&r1=1715385&r2=1715386&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java Fri Nov 20 16:21:46 2015
@@ -85,6 +85,7 @@ public class FetcherThread extends Threa
private boolean redirecting;
private int redirectCount;
private boolean ignoreExternalLinks;
+ private String ignoreExternalLinksMode;
// Used by fetcher.follow.outlinks.depth in parse
private int maxOutlinksPerPage;
@@ -168,14 +169,13 @@ public class FetcherThread extends Threa
}
LOG.info("Using queue mode : " + queueMode);
this.maxRedirect = conf.getInt("http.redirect.max", 3);
- this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links",
- false);
maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
: maxOutlinksPerPage;
interval = conf.getInt("db.fetch.interval.default", 2592000);
ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
+ ignoreExternalLinksMode = conf.get("db.ignore.external.links.mode", "byHost");
maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1);
outlinksIgnoreExternal = conf.getBoolean(
"fetcher.follow.outlinks.ignore.external", false);
@@ -616,19 +616,21 @@ public class FetcherThread extends Threa
}
}
- String fromHost;
+ String origin = null;
// collect outlinks for subsequent db update
Outlink[] links = parseData.getOutlinks();
int outlinksToStore = Math.min(maxOutlinks, links.length);
if (ignoreExternalLinks) {
- try {
- fromHost = new URL(url.toString()).getHost().toLowerCase();
- } catch (MalformedURLException e) {
- fromHost = null;
+ URL originURL = new URL(url.toString());
+ // based on domain?
+ if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+ origin = URLUtil.getDomainName(originURL).toLowerCase();
+ }
+ // use host
+ else {
+ origin = originURL.getHost().toLowerCase();
}
- } else {
- fromHost = null;
}
//used by fetchNode
@@ -646,7 +648,7 @@ public class FetcherThread extends Threa
String toUrl = links[i].getToUrl();
toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl,
- fromHost, ignoreExternalLinks, urlFilters, normalizers);
+ origin, ignoreExternalLinks, ignoreExternalLinksMode, urlFilters, normalizers);
if (toUrl == null) {
continue;
}
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=1715386&r1=1715385&r2=1715386&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Fri Nov 20 16:21:46 2015
@@ -104,6 +104,9 @@ public class ParseOutputFormat implement
final int interval = job.getInt("db.fetch.interval.default", 2592000);
final boolean ignoreExternalLinks = job.getBoolean(
"db.ignore.external.links", false);
+ final String ignoreExternalLinksMode = job.get(
+ "db.ignore.external.links.mode", "byHost");
+
int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
final boolean isParsing = job.getBoolean("fetcher.parse", true);
final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
@@ -152,7 +155,8 @@ public class ParseOutputFormat implement
public void write(Text key, Parse parse) throws IOException {
String fromUrl = key.toString();
- String fromHost = null;
+ // host or domain name of the source URL
+ String origin = null;
textOut.append(key, new ParseText(parse.getText()));
ParseData parseData = parse.getData();
@@ -184,15 +188,17 @@ public class ParseOutputFormat implement
if (parseMDCrawlDatum != null)
crawlOut.append(key, parseMDCrawlDatum);
+ // need to determine origin (once for all outlinks)
if (ignoreExternalLinks) {
- // need to determine fromHost (once for all outlinks)
- try {
- fromHost = new URL(fromUrl).getHost().toLowerCase();
- } catch (MalformedURLException e) {
- fromHost = null;
+ URL originURL = new URL(fromUrl.toString());
+ // based on domain?
+ if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+ origin = URLUtil.getDomainName(originURL).toLowerCase();
+ }
+ // use host
+ else {
+ origin = originURL.getHost().toLowerCase();
}
- } else {
- fromHost = null;
}
ParseStatus pstatus = parseData.getStatus();
@@ -200,8 +206,8 @@ public class ParseOutputFormat implement
&& pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
String newUrl = pstatus.getMessage();
int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
- newUrl = filterNormalize(fromUrl, newUrl, fromHost,
- ignoreExternalLinks, filters, normalizers,
+ newUrl = filterNormalize(fromUrl, newUrl, origin,
+ ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers,
URLNormalizers.SCOPE_FETCHER);
if (newUrl != null) {
@@ -231,8 +237,8 @@ public class ParseOutputFormat implement
// Only normalize and filter if fetcher.parse = false
if (!isParsing) {
- toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost,
- ignoreExternalLinks, filters, normalizers);
+ toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin,
+ ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers);
if (toUrl == null) {
continue;
}
@@ -310,29 +316,39 @@ public class ParseOutputFormat implement
}
public static String filterNormalize(String fromUrl, String toUrl,
- String fromHost, boolean ignoreExternalLinks, URLFilters filters,
+ String fromHost, boolean ignoreExternalLinks,
+ String ignoreExternalLinksMode, URLFilters filters,
URLNormalizers normalizers) {
return filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks,
- filters, normalizers, URLNormalizers.SCOPE_OUTLINK);
+ ignoreExternalLinksMode, filters, normalizers,
+ URLNormalizers.SCOPE_OUTLINK);
}
public static String filterNormalize(String fromUrl, String toUrl,
- String fromHost, boolean ignoreExternalLinks, URLFilters filters,
+ String origin, boolean ignoreExternalLinks, String ignoreExternalLinksMode, URLFilters filters,
URLNormalizers normalizers, String urlNormalizerScope) {
// ignore links to self (or anchors within the page)
if (fromUrl.equals(toUrl)) {
return null;
}
if (ignoreExternalLinks) {
- String toHost;
+ URL targetURL = null;
try {
- toHost = new URL(toUrl).getHost().toLowerCase();
- } catch (MalformedURLException e) {
- toHost = null;
- }
- if (toHost == null || !toHost.equals(fromHost)) { // external links
+ targetURL = new URL(toUrl);
+ } catch (MalformedURLException e1) {
return null; // skip it
}
+ if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+ String toDomain = URLUtil.getDomainName(targetURL).toLowerCase();
+ if (toDomain == null || !toDomain.equals(origin)) {
+ return null; // skip it
+ }
+ } else {
+ String toHost = targetURL.getHost().toLowerCase();
+ if (toHost == null || !toHost.equals(origin)) {
+ return null; // skip it
+ }
+ }
}
try {
if (normalizers != null) {