Author: kubes Date: Mon Dec 29 09:58:12 2008 New Revision: 729958 URL: http://svn.apache.org/viewvc?rev=729958&view=rev Log: NUTCH-668: Domain URL Filter plugin
Added: lucene/nutch/trunk/conf/domain-urlfilter.txt lucene/nutch/trunk/src/plugin/urlfilter-domain/ lucene/nutch/trunk/src/plugin/urlfilter-domain/build.xml lucene/nutch/trunk/src/plugin/urlfilter-domain/data/ lucene/nutch/trunk/src/plugin/urlfilter-domain/data/hosts.txt lucene/nutch/trunk/src/plugin/urlfilter-domain/plugin.xml lucene/nutch/trunk/src/plugin/urlfilter-domain/src/ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package.html lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=729958&r1=729957&r2=729958&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Dec 29 09:58:12 2008 @@ 
-298,6 +298,8 @@ 110. NUTCH-635 - LinkAnalysis Tool for Nutch. (kubes) 111. NUTCH-646 - New Indexing Framework for Nutch. (kubes) + +112. NUTCH-668 - Domain URL Filter. (kubes) Release 0.9 - 2007-04-02 Added: lucene/nutch/trunk/conf/domain-urlfilter.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/domain-urlfilter.txt?rev=729958&view=auto ============================================================================== --- lucene/nutch/trunk/conf/domain-urlfilter.txt (added) +++ lucene/nutch/trunk/conf/domain-urlfilter.txt Mon Dec 29 09:58:12 2008 @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# config file for urlfilter-domain plugin \ No newline at end of file Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=729958&r1=729957&r2=729958&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Mon Dec 29 09:58:12 2008 @@ -75,6 +75,7 @@ <ant dir="summary-lucene" target="deploy"/> <ant dir="tld" target="deploy"/> <ant dir="urlfilter-automaton" target="deploy"/> + <ant dir="urlfilter-domain" target="deploy" /> <ant dir="urlfilter-prefix" target="deploy"/> <ant dir="urlfilter-regex" target="deploy"/> <ant dir="urlfilter-suffix" target="deploy"/> @@ -109,6 +110,7 @@ <ant dir="parse-zip" target="test"/> <ant dir="query-url" target="test"/> <ant dir="urlfilter-automaton" target="test"/> + <ant dir="urlfilter-domain" target="test" /> <ant dir="urlfilter-regex" target="test"/> <ant dir="urlfilter-suffix" target="test"/> <ant dir="urlnormalizer-basic" target="test"/> @@ -173,6 +175,7 @@ <ant dir="summary-lucene" target="clean"/> <ant dir="tld" target="clean"/> <ant dir="urlfilter-automaton" target="clean"/> + <ant dir="urlfilter-domain" target="clean" /> <ant dir="urlfilter-prefix" target="clean"/> <ant dir="urlfilter-regex" target="clean"/> <ant dir="urlfilter-suffix" target="clean"/> Added: lucene/nutch/trunk/src/plugin/urlfilter-domain/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-domain/build.xml?rev=729958&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-domain/build.xml (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-domain/build.xml Mon Dec 29 09:58:12 2008 @@ -0,0 +1,28 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlfilter-domain" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="data" /> + </copy> + +</project> Added: lucene/nutch/trunk/src/plugin/urlfilter-domain/data/hosts.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-domain/data/hosts.txt?rev=729958&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-domain/data/hosts.txt (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-domain/data/hosts.txt Mon Dec 29 09:58:12 2008 @@ -0,0 +1,5 @@ +# comments start with the pound sign +net +apache.org +be +www.yahoo.com \ No newline at end of file Added: lucene/nutch/trunk/src/plugin/urlfilter-domain/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-domain/plugin.xml?rev=729958&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-domain/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-domain/plugin.xml Mon Dec 29 09:58:12 2008 @@ -0,0 +1,43 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) 
under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlfilter-domain" + name="Domain URL Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlfilter-domain.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlfilter.domain" + name="Nutch Domain URL Filter" + point="org.apache.nutch.net.URLFilter"> + <implementation id="DomainURLFilter" + class="org.apache.nutch.urlfilter.domain.DomainURLFilter"> + <!-- <parameter name="file" value="domain-urlfilter.txt"/> --> + </implementation> + </extension> + +</plugin> Added: lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java?rev=729958&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java Mon Dec 29 09:58:12 2008 @@ -0,0 +1,176 @@ 
+package org.apache.nutch.urlfilter.domain; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.util.LinkedHashSet; +import java.util.Set; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLFilter; +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.util.URLUtil; + +/** + * <p>Filters URLs based on a file containing domain suffixes, domain names, and + * hostnames. Only a url that matches one of the suffixes, domains, or hosts + * present in the file is allowed.</p> + * + * <p>Urls are checked in order of domain suffix, domain name, and hostname + * against entries in the domain file. The domain file would be set up as follows + * with one entry per line:</p> + * + * <pre> com apache.org www.apache.org </pre> + * + * <p>The first line is an example of a filter that would allow all .com + * domains. The second line allows all urls from apache.org and all of its + * subdomains such as lucene.apache.org and hadoop.apache.org. The third line + * would allow only urls from www.apache.org. There is no specific ordering to + * entries. The entries are from more general to more specific with the more + * general overriding the more specific.</p> + * + * The domain file defaults to domain-urlfilter.txt in the classpath but can be + * overridden using the: + * + * <ul> <li>property "urlfilter.domain.file" in ./conf/nutch-*.xml, and</li> + * <li>attribute "file" in plugin.xml of this plugin</li> </ul> + * + * the attribute "file" has higher precedence if defined. A domain file passed to the constructor overrides both. + */ +public class DomainURLFilter + implements URLFilter { + + private static final Log LOG = LogFactory.getLog(DomainURLFilter.class); + + // attribute "file" of this plugin, read in setConf; static, so shared by all instances. 
+ private static String attributeFile = null; + private Configuration conf; + private String domainFile = null; + private Set<String> domainSet = new LinkedHashSet<String>(); + + private void readConfigurationFile(Reader configReader) + throws IOException { + + // read the configuration file, line by line (the reader is not closed here) + BufferedReader reader = new BufferedReader(configReader); + String line = null; + while ((line = reader.readLine()) != null) { + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { + // add non-blank, non-comment lines, lower-cased; lines are not trimmed + domainSet.add(StringUtils.lowerCase(line)); + } + } + } + + /** + * Default constructor. + */ + public DomainURLFilter() { + + } + + /** + * Constructor that specifies the domain file to use. The file name is only stored here; it is read when setConf is called. + * + * @param domainFile The domain file, overrides domain-urlfilter.txt default. + * + * @see #setConf(Configuration) + */ + public DomainURLFilter(String domainFile) { + this.domainFile = domainFile; + } + + /** + * Sets the configuration, resolves which domain file to use, and loads its entries into the domain set. + */ + public void setConf(Configuration conf) { + this.conf = conf; + + // look up the "file" attribute of this plugin among the URLFilter extensions + String pluginName = "urlfilter-domain"; + Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( + URLFilter.class.getName()).getExtensions(); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + if (extension.getDescriptor().getPluginId().equals(pluginName)) { + attributeFile = extension.getAttribute("file"); + break; + } + } + + // treat a blank (whitespace-only) attribute value as undefined + if (attributeFile != null && attributeFile.trim().equals("")) { + attributeFile = null; + } + + if (attributeFile != null) { + if (LOG.isInfoEnabled()) { + LOG.info("Attribute \"file\" is defined for plugin " + pluginName + + " as " + attributeFile); + } + } + else { + if (LOG.isWarnEnabled()) { + LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " + + pluginName); + } + } + + // constructor domain file, then attribute "file", take precedence over the property if defined + String file = 
conf.get("urlfilter.domain.file"); + if (domainFile != null) { + file = domainFile; + } + else if (attributeFile != null) { + file = attributeFile; + } + + // get the file as a classpath resource, falling back to a plain file path, + // and populate the domain set with the domains from the file + try { + Reader reader = conf.getConfResourceAsReader(file); + if (reader == null) { + reader = new FileReader(file); + } + readConfigurationFile(reader); + } + catch (IOException e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } + + public Configuration getConf() { + return this.conf; + } + + public String filter(String url) { + + try { + + // match for suffix, domain, and host in that order. more general will + // override more specific + String suffix = URLUtil.getDomainSuffix(url).getDomain(); + String domain = URLUtil.getDomainName(url).toLowerCase().trim(); + String host = URLUtil.getHost(url); + if (domainSet.contains(suffix) || domainSet.contains(domain) + || domainSet.contains(host)) { + return url; + } + + // doesn't match, don't allow + return null; + } + catch (Exception e) { + + // if an error happens, log it and reject the url (null is returned) + LOG.error("Could not apply filter on url: " + url + "\n" + + org.apache.hadoop.util.StringUtils.stringifyException(e)); + return null; + } + } +} Added: lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package.html?rev=729958&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package.html (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package.html Mon Dec 29 09:58:12 2008 @@ -0,0 +1,5 @@ +<html> +<body> +<p>A url filter plugin that filters by domain.</p><p></p> +</body> +</html> 
Added: lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java?rev=729958&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Mon Dec 29 09:58:12 2008 @@ -0,0 +1,41 @@ +package org.apache.nutch.urlfilter.domain; + +import junit.framework.TestCase; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +public class TestDomainURLFilter + extends TestCase { + + protected static final Log LOG = LogFactory.getLog(TestDomainURLFilter.class); + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); /* directory holding hosts.txt; defaults to the working directory */ + + public TestDomainURLFilter(String testName) { + super(testName); + } + + public void testFilter() + throws Exception { /* loads hosts.txt and checks which urls the filter accepts (non-null) or rejects (null) */ + + String domainFile = SAMPLES + SEPARATOR + "hosts.txt"; + Configuration conf = NutchConfiguration.create(); + DomainURLFilter domainFilter = new DomainURLFilter(domainFile); + domainFilter.setConf(conf); + assertNotNull(domainFilter.filter("http://lucene.apache.org")); /* expected to pass via the apache.org entry */ + assertNotNull(domainFilter.filter("http://hadoop.apache.org")); /* expected to pass via the apache.org entry */ + assertNotNull(domainFilter.filter("http://www.apache.org")); /* expected to pass via the apache.org entry */ + assertNull(domainFilter.filter("http://www.google.com")); /* no com, google.com or www.google.com entry */ + assertNull(domainFilter.filter("http://mail.yahoo.com")); /* only the host www.yahoo.com is listed */ + assertNotNull(domainFilter.filter("http://www.foobar.net")); /* expected to pass via the net suffix entry */ + assertNotNull(domainFilter.filter("http://www.foobas.net")); /* expected to pass via the net suffix entry */ + 
assertNotNull(domainFilter.filter("http://www.yahoo.com")); /* expected to pass via the www.yahoo.com host entry */ + assertNotNull(domainFilter.filter("http://www.foobar.be")); /* expected to pass via the be suffix entry */ + assertNull(domainFilter.filter("http://www.adobe.com")); /* no matching suffix, domain or host entry */ + } + +}