Author: markus
Date: Thu Feb 12 08:42:49 2015
New Revision: 1659169

URL: http://svn.apache.org/r1659169
Log:
NUTCH-1913 LinkDB to implement db.ignore.external.links
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659169&r1=1659168&r2=1659169&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Feb 12 08:42:49 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1913 LinkDB to implement db.ignore.external.links (markus, snagel) + * NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus) * NUTCH-1323 AjaxNormalizer (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1659169&r1=1659168&r2=1659169&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Thu Feb 12 08:42:49 2015 @@ -49,12 +49,14 @@ public class LinkDb extends Configured i public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class); public static final String IGNORE_INTERNAL_LINKS = "db.ignore.internal.links"; + public static final String IGNORE_EXTERNAL_LINKS = "db.ignore.external.links"; public static final String CURRENT_NAME = "current"; public static final String LOCK_NAME = ".locked"; private int maxAnchorLength; private boolean ignoreInternalLinks; + private boolean ignoreExternalLinks; private URLFilters urlFilters; private URLNormalizers urlNormalizers; @@ -68,6 +70,8 @@ public class LinkDb extends Configured i public void configure(JobConf job) { maxAnchorLength = job.getInt("db.max.anchor.length", 100); ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true); + ignoreExternalLinks = job.getBoolean(IGNORE_EXTERNAL_LINKS, false); + if 
(job.getBoolean(LinkDbFilter.URL_FILTERING, false)) { urlFilters = new URLFilters(job); } @@ -115,6 +119,11 @@ public class LinkDb extends Configured i if (toHost == null || toHost.equals(fromHost)) { // internal link continue; // skip it } + } else if (ignoreExternalLinks) { + String toHost = getHost(toUrl); + if (toHost == null || !toHost.equals(fromHost)) { // external link + continue; // skip it + } } if (urlNormalizers != null) { try { @@ -180,6 +189,15 @@ public class LinkDb extends Configured i if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)) { LOG.info("LinkDb: internal links will be ignored."); } + if (job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) { + LOG.info("LinkDb: external links will be ignored."); + } + } + if (job.getBoolean(IGNORE_INTERNAL_LINKS, true) + && job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) { + LOG.warn("LinkDb: internal and external links are ignored! " + + "Nothing to do, actually. Exiting."); + return; } for (int i = 0; i < segments.length; i++) { @@ -291,7 +309,6 @@ public class LinkDb extends Configured i System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs"); return -1; } - Path segDir = null; final FileSystem fs = FileSystem.get(getConf()); Path db = new Path(args[0]); ArrayList<Path> segs = new ArrayList<Path>();