Hey Markus, great work with this one.
I notice that you did not add <ant dir="urlfilter-domainblacklist" target="test" /> to nutch/trunk/src/plugin/build.xml Lewis On Thu, Feb 23, 2012 at 12:32 PM, <[email protected]> wrote: > Author: markus > Date: Thu Feb 23 12:32:49 2012 > New Revision: 1292764 > > URL: http://svn.apache.org/viewvc?rev=1292764&view=rev > Log: > NUTCH-1210 Domain Blacklist Filter > > Added: > nutch/trunk/conf/domainblacklist-urlfilter.txt > nutch/trunk/src/plugin/urlfilter-domainblacklist/ > nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml > nutch/trunk/src/plugin/urlfilter-domainblacklist/data/ > nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt > nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml > nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/ > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/ > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/ > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/ > > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/ > > > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/ > > > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/ > > > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/ > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/ > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/ > > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/ > > > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/ > > > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/ > > > 
nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java > Modified: > nutch/trunk/CHANGES.txt > nutch/trunk/src/plugin/build.xml > > Modified: nutch/trunk/CHANGES.txt > URL: > http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1292764&r1=1292763&r2=1292764&view=diff > > ============================================================================== > --- nutch/trunk/CHANGES.txt (original) > +++ nutch/trunk/CHANGES.txt Thu Feb 23 12:32:49 2012 > @@ -1,5 +1,7 @@ > Nutch Change Log > > +* NUTCH-1210 DomainBlacklistFilter (markus) > + > * NUTCH-965 Skip parsing for truncated documents (alexis, lewismc, ferdy) > > * NUTCH-1193 Incorrect url transform to lowercase: parameter solr > (Eduardo dos Santos Leggiero via lewismc) > > Added: nutch/trunk/conf/domainblacklist-urlfilter.txt > URL: > http://svn.apache.org/viewvc/nutch/trunk/conf/domainblacklist-urlfilter.txt?rev=1292764&view=auto > > ============================================================================== > --- nutch/trunk/conf/domainblacklist-urlfilter.txt (added) > +++ nutch/trunk/conf/domainblacklist-urlfilter.txt Thu Feb 23 12:32:49 2012 > @@ -0,0 +1,16 @@ > +# Licensed to the Apache Software Foundation (ASF) under one or more > +# contributor license agreements. See the NOTICE file distributed with > +# this work for additional information regarding copyright ownership. > +# The ASF licenses this file to You under the Apache License, Version 2.0 > +# (the "License"); you may not use this file except in compliance with > +# the License. You may obtain a copy of the License at > +# > +# http://www.apache.org/licenses/LICENSE-2.0 > +# > +# Unless required by applicable law or agreed to in writing, software > +# distributed under the License is distributed on an "AS IS" BASIS, > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
> +# See the License for the specific language governing permissions and > +# limitations under the License. > + > +# config file for urlfilter-domainblacklist plugin > > Modified: nutch/trunk/src/plugin/build.xml > URL: > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1292764&r1=1292763&r2=1292764&view=diff > > ============================================================================== > --- nutch/trunk/src/plugin/build.xml (original) > +++ nutch/trunk/src/plugin/build.xml Thu Feb 23 12:32:49 2012 > @@ -57,6 +57,7 @@ > <ant dir="tld" target="deploy"/> > <ant dir="urlfilter-automaton" target="deploy"/> > <ant dir="urlfilter-domain" target="deploy" /> > + <ant dir="urlfilter-domainblacklist" target="deploy" /> > <ant dir="urlfilter-prefix" target="deploy"/> > <ant dir="urlfilter-regex" target="deploy"/> > <ant dir="urlfilter-suffix" target="deploy"/> > @@ -132,6 +133,7 @@ > <ant dir="tld" target="clean"/> > <ant dir="urlfilter-automaton" target="clean"/> > <ant dir="urlfilter-domain" target="clean" /> > + <ant dir="urlfilter-domainblacklist" target="clean" /> > <ant dir="urlfilter-prefix" target="clean"/> > <ant dir="urlfilter-regex" target="clean"/> > <ant dir="urlfilter-suffix" target="clean"/> > > Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml > URL: > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml?rev=1292764&view=auto > > ============================================================================== > --- nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml (added) > +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/build.xml Thu Feb 23 > 12:32:49 2012 > @@ -0,0 +1,28 @@ > +<?xml version="1.0"?> > +<!-- > + Licensed to the Apache Software Foundation (ASF) under one or more > + contributor license agreements. See the NOTICE file distributed with > + this work for additional information regarding copyright ownership. 
> + The ASF licenses this file to You under the Apache License, Version 2.0 > + (the "License"); you may not use this file except in compliance with > + the License. You may obtain a copy of the License at > + > + http://www.apache.org/licenses/LICENSE-2.0 > + > + Unless required by applicable law or agreed to in writing, software > + distributed under the License is distributed on an "AS IS" BASIS, > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + See the License for the specific language governing permissions and > + limitations under the License. > +--> > +<project name="urlfilter-domainblacklist" default="jar-core"> > + > + <import file="../build-plugin.xml"/> > + > + <!-- for junit test --> > + <mkdir dir="${build.test}/data"/> > + <copy todir="${build.test}/data"> > + <fileset dir="data" /> > + </copy> > + > +</project> > > Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt > URL: > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt?rev=1292764&view=auto > > ============================================================================== > --- nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt (added) > +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/data/hosts.txt Thu > Feb 23 12:32:49 2012 > @@ -0,0 +1,5 @@ > +# comments start with the pound sign > +net > +apache.org > +be > +www.yahoo.com > \ No newline at end of file > > Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml > URL: > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml?rev=1292764&view=auto > > ============================================================================== > --- nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml (added) > +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/ivy.xml Thu Feb 23 > 12:32:49 2012 > @@ -0,0 +1,41 @@ > +<?xml version="1.0" ?> > + > +<!-- > + Licensed to the Apache Software Foundation (ASF) 
under one or more > + contributor license agreements. See the NOTICE file distributed with > + this work for additional information regarding copyright ownership. > + The ASF licenses this file to You under the Apache License, Version 2.0 > + (the "License"); you may not use this file except in compliance with > + the License. You may obtain a copy of the License at > + > + http://www.apache.org/licenses/LICENSE-2.0 > + > + Unless required by applicable law or agreed to in writing, software > + distributed under the License is distributed on an "AS IS" BASIS, > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + See the License for the specific language governing permissions and > + limitations under the License. > +--> > + > +<ivy-module version="1.0"> > + <info organisation="org.apache.nutch" module="${ant.project.name}"> > + <license name="Apache 2.0"/> > + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> > + <description> > + Apache Nutch > + </description> > + </info> > + > + <configurations> > + <include file="../../../ivy/ivy-configurations.xml"/> > + </configurations> > + > + <publications> > + <!--get the artifact from our module name--> > + <artifact conf="master"/> > + </publications> > + > + <dependencies> > + </dependencies> > + > +</ivy-module> > > Added: nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml > URL: > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml?rev=1292764&view=auto > > ============================================================================== > --- nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml (added) > +++ nutch/trunk/src/plugin/urlfilter-domainblacklist/plugin.xml Thu Feb 23 > 12:32:49 2012 > @@ -0,0 +1,43 @@ > +<?xml version="1.0" encoding="UTF-8"?> > +<!-- > + Licensed to the Apache Software Foundation (ASF) under one or more > + contributor license agreements. 
See the NOTICE file distributed with > + this work for additional information regarding copyright ownership. > + The ASF licenses this file to You under the Apache License, Version 2.0 > + (the "License"); you may not use this file except in compliance with > + the License. You may obtain a copy of the License at > + > + http://www.apache.org/licenses/LICENSE-2.0 > + > + Unless required by applicable law or agreed to in writing, software > + distributed under the License is distributed on an "AS IS" BASIS, > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + See the License for the specific language governing permissions and > + limitations under the License. > +--> > +<plugin > + id="urlfilter-domainblacklist" > + name="Domain Blacklist URL Filter" > + version="1.0.0" > + provider-name="nutch.org"> > + > + <runtime> > + <library name="urlfilter-domainblacklist.jar"> > + <export name="*"/> > + </library> > + </runtime> > + > + <requires> > + <import plugin="nutch-extensionpoints"/> > + </requires> > + > + <extension id="org.apache.nutch.net.urlfilter.domainblacklist" > + name="Nutch Domain Blacklist URL Filter" > + point="org.apache.nutch.net.URLFilter"> > + <implementation id="DomainBlacklistURLFilter" > + > class="org.apache.nutch.urlfilter.domainblacklist.DomainBlacklistURLFilter"> > + <parameter name="file" value="domainblacklist-urlfilter.txt"/> > + </implementation> > + </extension> > + > +</plugin> > > Added: > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java > URL: > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java?rev=1292764&view=auto > > ============================================================================== > --- > 
nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java > (added) > +++ > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java > Thu Feb 23 12:32:49 2012 > @@ -0,0 +1,203 @@ > +/* > + * Licensed to the Apache Software Foundation (ASF) under one or more > + * contributor license agreements. See the NOTICE file distributed with > + * this work for additional information regarding copyright ownership. > + * The ASF licenses this file to You under the Apache License, Version 2.0 > + * (the "License"); you may not use this file except in compliance with > + * the License. You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > +package org.apache.nutch.urlfilter.domainblacklist; > + > +import java.io.BufferedReader; > +import java.io.FileReader; > +import java.io.IOException; > +import java.io.Reader; > +import java.io.StringReader; > +import java.util.LinkedHashSet; > +import java.util.Set; > + > +import org.apache.commons.lang.StringUtils; > +import org.slf4j.Logger; > +import org.slf4j.LoggerFactory; > +import org.apache.hadoop.conf.Configuration; > +import org.apache.nutch.net.URLFilter; > +import org.apache.nutch.plugin.Extension; > +import org.apache.nutch.plugin.PluginRepository; > +import org.apache.nutch.util.URLUtil; > +import org.apache.nutch.util.domain.DomainSuffix; > + > +/** > + * <p>Filters URLs based on a file containing domain suffixes, domain > names, and > + * hostnames. 
A url that matches one of the suffixes, domains, or hosts > + * present in the file is filtered out.</p> > + * > + * <p>Urls are checked in order of domain suffix, domain name, and > hostname > + * against entries in the domain file. The domain file would be setup as > follows > + * with one entry per line: > + * > + * <pre> com apache.org www.apache.org </pre> > + * > + * <p>The first line is an example of a filter that would allow all .com > + * domains. The second line allows all urls from apache.org and all of > its > + * subdomains such as lucene.apache.org and hadoop.apache.org. The third > line > + * would allow only urls from www.apache.org. There is no specific > ordering to > + * entries. The entries are from more general to more specific with the > more > + * general overridding the more specific.</p> > + * > + * The domain file defaults to domainblacklist-urlfilter.txt in the > classpath but can be > + * overridden using the: > + * > + * <ul> <ol>property "urlfilter.domainblacklist.file" in > ./conf/nutch-*.xml, and</ol> > + * <ol>attribute "file" in plugin.xml of this plugin</ol> </ul> > + * > + * the attribute "file" has higher precedence if defined. > + */ > +public class DomainBlacklistURLFilter > + implements URLFilter { > + > + private static final Logger LOG = > LoggerFactory.getLogger(DomainBlacklistURLFilter.class); > + > + // read in attribute "file" of this plugin. 
> + private static String attributeFile = null; > + private Configuration conf; > + private String domainFile = null; > + private Set<String> domainSet = new LinkedHashSet<String>(); > + > + private void readConfiguration(Reader configReader) > + throws IOException { > + > + // read the configuration file, line by line > + BufferedReader reader = new BufferedReader(configReader); > + String line = null; > + while ((line = reader.readLine()) != null) { > + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { > + // add non-blank lines and non-commented lines > + domainSet.add(StringUtils.lowerCase(line)); > + } > + } > + } > + > + /** > + * Default constructor. > + */ > + public DomainBlacklistURLFilter() { > + > + } > + > + /** > + * Constructor that specifies the domain file to use. > + * > + * @param domainFile The domain file, overrides > domainblacklist-urlfilter.text default. > + * > + * @throws IOException > + */ > + public DomainBlacklistURLFilter(String domainFile) { > + this.domainFile = domainFile; > + } > + > + /** > + * Sets the configuration. 
> + */ > + public void setConf(Configuration conf) { > + this.conf = conf; > + > + // get the extensions for domain urlfilter > + String pluginName = "urlfilter-domainblacklist"; > + Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( > + URLFilter.class.getName()).getExtensions(); > + for (int i = 0; i < extensions.length; i++) { > + Extension extension = extensions[i]; > + if (extension.getDescriptor().getPluginId().equals(pluginName)) { > + attributeFile = extension.getAttribute("file"); > + break; > + } > + } > + > + // handle blank non empty input > + if (attributeFile != null && attributeFile.trim().equals("")) { > + attributeFile = null; > + } > + > + if (attributeFile != null) { > + if (LOG.isInfoEnabled()) { > + LOG.info("Attribute \"file\" is defined for plugin " + pluginName > + + " as " + attributeFile); > + } > + } > + else { > + if (LOG.isWarnEnabled()) { > + LOG.warn("Attribute \"file\" is not defined in plugin.xml for > plugin " > + + pluginName); > + } > + } > + > + // domain file and attribute "file" take precedence if defined > + String file = conf.get("urlfilter.domainblacklist.file"); > + String stringRules = conf.get("urlfilter.domainblacklist.rules"); > + if (domainFile != null) { > + file = domainFile; > + } > + else if (attributeFile != null) { > + file = attributeFile; > + } > + Reader reader = null; > + if (stringRules != null) { // takes precedence over files > + reader = new StringReader(stringRules); > + } else { > + reader = conf.getConfResourceAsReader(file); > + } > + try { > + if (reader == null) { > + reader = new FileReader(file); > + } > + readConfiguration(reader); > + } > + catch (IOException e) { > + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); > + } > + } > + > + public Configuration getConf() { > + return this.conf; > + } > + > + public String filter(String url) { > + > + try { > + > + // match for suffix, domain, and host in that order. 
more general > will > + // override more specific > + String domain = URLUtil.getDomainName(url).toLowerCase().trim(); > + String host = URLUtil.getHost(url); > + String suffix = null; > + DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url); > + if (domainSuffix != null) { > + suffix = domainSuffix.getDomain(); > + } > + > + if (domainSet.contains(suffix) || domainSet.contains(domain) > + || domainSet.contains(host)) { > + // Matches, filter! > + return null; > + } > + > + // doesn't match, allow > + return url; > + } > + catch (Exception e) { > + > + // if an error happens, allow the url to pass > + LOG.error("Could not apply filter on url: " + url + "\n" > + + org.apache.hadoop.util.StringUtils.stringifyException(e)); > + return null; > + } > + } > +} > > Added: > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java > URL: > http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java?rev=1292764&view=auto > > ============================================================================== > --- > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java > (added) > +++ > nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java > Thu Feb 23 12:32:49 2012 > @@ -0,0 +1,57 @@ > +/* > + * Licensed to the Apache Software Foundation (ASF) under one or more > + * contributor license agreements. See the NOTICE file distributed with > + * this work for additional information regarding copyright ownership. > + * The ASF licenses this file to You under the Apache License, Version 2.0 > + * (the "License"); you may not use this file except in compliance with > + * the License. 
You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > +package org.apache.nutch.urlfilter.domainblacklist; > + > +import junit.framework.TestCase; > + > +import org.slf4j.Logger; > +import org.slf4j.LoggerFactory; > +import org.apache.hadoop.conf.Configuration; > +import org.apache.nutch.util.NutchConfiguration; > + > +public class TestDomainBlacklistURLFilter > + extends TestCase { > + > + protected static final Logger LOG = > LoggerFactory.getLogger(TestDomainBlacklistURLFilter.class); > + > + private final static String SEPARATOR = > System.getProperty("file.separator"); > + private final static String SAMPLES = System.getProperty("test.data", > "."); > + > + public TestDomainBlacklistURLFilter(String testName) { > + super(testName); > + } > + > + public void testFilter() > + throws Exception { > + > + String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt"; > + Configuration conf = NutchConfiguration.create(); > + DomainBlacklistURLFilter domainBlacklistFilter = new > DomainBlacklistURLFilter(domainBlacklistFile); > + domainBlacklistFilter.setConf(conf); > + assertNull(domainBlacklistFilter.filter("http://lucene.apache.org")); > + assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org")); > + assertNull(domainBlacklistFilter.filter("http://www.apache.org")); > + assertNotNull(domainBlacklistFilter.filter("http://www.google.com")); > + assertNotNull(domainBlacklistFilter.filter("http://mail.yahoo.com")); > + assertNull(domainBlacklistFilter.filter("http://www.foobar.net")); > + assertNull(domainBlacklistFilter.filter("http://www.foobas.net")); > + 
assertNull(domainBlacklistFilter.filter("http://www.yahoo.com")); > + assertNull(domainBlacklistFilter.filter("http://www.foobar.be")); > + assertNotNull(domainBlacklistFilter.filter("http://www.adobe.com")); > + } > + > +} > > > -- *Lewis*

