Author: kubes
Date: Mon Dec 29 09:58:12 2008
New Revision: 729958

URL: http://svn.apache.org/viewvc?rev=729958&view=rev
Log:
NUTCH-668: Domain URL Filter plugin

Added:
    lucene/nutch/trunk/conf/domain-urlfilter.txt
    lucene/nutch/trunk/src/plugin/urlfilter-domain/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/build.xml
    lucene/nutch/trunk/src/plugin/urlfilter-domain/data/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/data/hosts.txt
    lucene/nutch/trunk/src/plugin/urlfilter-domain/plugin.xml
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package.html
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/
    lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=729958&r1=729957&r2=729958&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Dec 29 09:58:12 2008
@@ -298,6 +298,8 @@
 110. NUTCH-635 -  LinkAnalysis Tool for Nutch. (kubes)
 
 111. NUTCH-646 -  New Indexing Framework for Nutch. (kubes)
+
+112. NUTCH-668 -  Domain URL Filter. (kubes)
      
 Release 0.9 - 2007-04-02
 

Added: lucene/nutch/trunk/conf/domain-urlfilter.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/domain-urlfilter.txt?rev=729958&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/domain-urlfilter.txt (added)
+++ lucene/nutch/trunk/conf/domain-urlfilter.txt Mon Dec 29 09:58:12 2008
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# config file for urlfilter-domain plugin
\ No newline at end of file

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=729958&r1=729957&r2=729958&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Mon Dec 29 09:58:12 2008
@@ -75,6 +75,7 @@
      <ant dir="summary-lucene" target="deploy"/>
      <ant dir="tld" target="deploy"/>
      <ant dir="urlfilter-automaton" target="deploy"/>
+     <ant dir="urlfilter-domain" target="deploy" />
      <ant dir="urlfilter-prefix" target="deploy"/>
      <ant dir="urlfilter-regex" target="deploy"/>
      <ant dir="urlfilter-suffix" target="deploy"/>
@@ -109,6 +110,7 @@
      <ant dir="parse-zip" target="test"/>
      <ant dir="query-url" target="test"/>
      <ant dir="urlfilter-automaton" target="test"/>
+     <ant dir="urlfilter-domain" target="test" />
      <ant dir="urlfilter-regex" target="test"/>
      <ant dir="urlfilter-suffix" target="test"/>
      <ant dir="urlnormalizer-basic" target="test"/>
@@ -173,6 +175,7 @@
     <ant dir="summary-lucene" target="clean"/>
     <ant dir="tld" target="clean"/>
     <ant dir="urlfilter-automaton" target="clean"/>
+    <ant dir="urlfilter-domain" target="clean" />
     <ant dir="urlfilter-prefix" target="clean"/>
     <ant dir="urlfilter-regex" target="clean"/>
     <ant dir="urlfilter-suffix" target="clean"/>

Added: lucene/nutch/trunk/src/plugin/urlfilter-domain/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-domain/build.xml?rev=729958&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-domain/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-domain/build.xml Mon Dec 29 09:58:12 2008
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-domain" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+
+</project>

Added: lucene/nutch/trunk/src/plugin/urlfilter-domain/data/hosts.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-domain/data/hosts.txt?rev=729958&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-domain/data/hosts.txt (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-domain/data/hosts.txt Mon Dec 29 09:58:12 2008
@@ -0,0 +1,5 @@
+# comments start with the pound sign
+net
+apache.org
+be
+www.yahoo.com
\ No newline at end of file

Added: lucene/nutch/trunk/src/plugin/urlfilter-domain/plugin.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-domain/plugin.xml?rev=729958&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-domain/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-domain/plugin.xml Mon Dec 29 09:58:12 2008
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-domain"
+   name="Domain URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-domain.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.domain"
+              name="Nutch Domain URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="DomainURLFilter"
+        class="org.apache.nutch.urlfilter.domain.DomainURLFilter">
+        <!-- <parameter name="file" value="domain-urlfilter.txt"/> -->
+      </implementation>
+   </extension>
+
+</plugin>

Added: 
lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java?rev=729958&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java Mon Dec 29 09:58:12 2008
@@ -0,0 +1,176 @@
+package org.apache.nutch.urlfilter.domain;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * <p>Filters URLs based on a file containing domain suffixes, domain names, and
+ * hostnames. Only a url that matches one of the suffixes, domains, or hosts
+ * present in the file is allowed.</p>
+ * 
+ * <p>Urls are checked in order of domain suffix, domain name, and hostname
+ * against entries in the domain file. The domain file would be set up as follows
+ * with one entry per line:
+ * 
+ * <pre> com apache.org www.apache.org </pre>
+ * 
+ * <p>The first line is an example of a filter that would allow all .com
+ * domains. The second line allows all urls from apache.org and all of its
+ * subdomains such as lucene.apache.org and hadoop.apache.org. The third line
+ * would allow only urls from www.apache.org. There is no specific ordering to
+ * entries. The entries are from more general to more specific with the more
+ * general overriding the more specific.</p>
+ * 
+ * The domain file defaults to domain-urlfilter.txt in the classpath but can be
+ * overridden using the:
+ * 
+ * <ul> <li>property "urlfilter.domain.file" in ./conf/nutch-*.xml, and</li>
+ * <li>attribute "file" in plugin.xml of this plugin</li> </ul>
+ * 
+ * the attribute "file" has higher precedence if defined.
+ */
+public class DomainURLFilter
+  implements URLFilter {
+
+  private static final Log LOG = LogFactory.getLog(DomainURLFilter.class);
+
+  // read in attribute "file" of this plugin.
+  private static String attributeFile = null;
+  private Configuration conf;
+  private String domainFile = null;
+  private Set<String> domainSet = new LinkedHashSet<String>();
+
+  private void readConfigurationFile(Reader configReader)
+    throws IOException {
+
+    // read the configuration file, line by line
+    BufferedReader reader = new BufferedReader(configReader);
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+        // add non-blank lines and non-commented lines
+        domainSet.add(StringUtils.lowerCase(line));
+      }
+    }
+  }
+
+  /**
+   * Default constructor.
+   */
+  public DomainURLFilter() {
+
+  }
+
+  /**
+   * Constructor that specifies the domain file to use.
+   * 
+   * @param domainFile The domain file, overrides domain-urlfilter.txt default.
+   * 
+   * @throws IOException
+   */
+  public DomainURLFilter(String domainFile) {
+    this.domainFile = domainFile;
+  }
+
+  /**
+   * Sets the configuration.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // get the extensions for domain urlfilter
+    String pluginName = "urlfilter-domain";
+    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
+      URLFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+    
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+    
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+          + " as " + attributeFile);
+      }
+    }
+    else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+          + pluginName);
+      }
+    }
+
+    // domain file and attribute "file" take precedence if defined
+    String file = conf.get("urlfilter.domain.file");    
+    if (domainFile != null) {
+      file = domainFile;
+    }
+    else if (attributeFile != null) {
+      file = attributeFile;
+    }
+
+    // get the file as a classpath resource and populate the domain set with
+    // the domains from the file
+    try {
+      Reader reader = conf.getConfResourceAsReader(file);
+      if (reader == null) {
+        reader = new FileReader(file);
+      }
+      readConfigurationFile(reader);
+    }
+    catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public String filter(String url) {
+
+    try {
+
+      // match for suffix, domain, and host in that order.  more general will
+      // override more specific
+      String suffix = URLUtil.getDomainSuffix(url).getDomain();
+      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
+      String host = URLUtil.getHost(url);
+      if (domainSet.contains(suffix) || domainSet.contains(domain)
+        || domainSet.contains(host)) {
+        return url;
+      }
+
+      // doesn't match, don't allow
+      return null;
+    }
+    catch (Exception e) {
+      
+      // if an error happens, log it and reject the url (null means not allowed)
+      LOG.error("Could not apply filter on url: " + url + "\n"
+        + org.apache.hadoop.util.StringUtils.stringifyException(e));
+      return null;
+    }
+  }
+}

Added: 
lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package.html?rev=729958&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package.html (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package.html Mon Dec 29 09:58:12 2008
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>A url filter plugin that filters by domain.</p><p></p>
+</body>
+</html>

Added: 
lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java?rev=729958&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Mon Dec 29 09:58:12 2008
@@ -0,0 +1,41 @@
+package org.apache.nutch.urlfilter.domain;
+
+import junit.framework.TestCase;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+public class TestDomainURLFilter
+  extends TestCase {
+
+  protected static final Log LOG = LogFactory.getLog(TestDomainURLFilter.class);
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  public TestDomainURLFilter(String testName) {
+    super(testName);
+  }
+
+  public void testFilter()
+    throws Exception {
+
+    String domainFile = SAMPLES + SEPARATOR + "hosts.txt";
+    Configuration conf = NutchConfiguration.create();
+    DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+    domainFilter.setConf(conf);
+    assertNotNull(domainFilter.filter("http://lucene.apache.org"));
+    assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
+    assertNotNull(domainFilter.filter("http://www.apache.org"));
+    assertNull(domainFilter.filter("http://www.google.com"));
+    assertNull(domainFilter.filter("http://mail.yahoo.com"));
+    assertNotNull(domainFilter.filter("http://www.foobar.net"));
+    assertNotNull(domainFilter.filter("http://www.foobas.net"));
+    assertNotNull(domainFilter.filter("http://www.yahoo.com"));
+    assertNotNull(domainFilter.filter("http://www.foobar.be"));
+    assertNull(domainFilter.filter("http://www.adobe.com"));
+  }
+
+}


Reply via email to