Author: markus
Date: Thu Dec 24 12:45:27 2015
New Revision: 1721615

URL: http://svn.apache.org/viewvc?rev=1721615&view=rev
Log:
NUTCH-2189 Domain filter must deactivate if no rules are present

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
    nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1721615&r1=1721614&r2=1721615&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Dec 24 12:45:27 2015
@@ -1,6 +1,8 @@
 Nutch Change Log
 
-* NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency
+* NUTCH-2189 Domain filter must deactivate if no rules are present (markus)
+
+* NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency (joyce)
 
 * NUTCH-2183 Improvement to SegmentChecker for skipping non-segments present in segments directory (lewismc)
 

Modified: nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java?rev=1721615&r1=1721614&r2=1721615&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java Thu Dec 24 12:45:27 2015
@@ -180,9 +180,10 @@ public class DomainURLFilter implements
   }
 
   public String filter(String url) {
-
+    // https://issues.apache.org/jira/browse/NUTCH-2189
+    if (domainSet.size() == 0) return url;
+
     try {
-
       // match for suffix, domain, and host in that order. more general will
       // override more specific
       String domain = URLUtil.getDomainName(url).toLowerCase().trim();

Modified: nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java?rev=1721615&r1=1721614&r2=1721615&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Thu Dec 24 12:45:27 2015
@@ -44,5 +44,24 @@ public class TestDomainURLFilter {
     Assert.assertNotNull(domainFilter.filter("http://www.foobar.be"));
     Assert.assertNull(domainFilter.filter("http://www.adobe.com"));
   }
+
+  @Test
+  public void testNoFilter() throws Exception {
+    // https://issues.apache.org/jira/browse/NUTCH-2189
+    String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt";
+    Configuration conf = NutchConfiguration.create();
+    DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+    domainFilter.setConf(conf);
+    Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://www.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://www.google.com"));
+    Assert.assertNotNull(domainFilter.filter("http://mail.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobas.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.be"));
+    Assert.assertNotNull(domainFilter.filter("http://www.adobe.com"));
+  }
 
 }