Author: ab Date: Mon May 15 05:14:36 2006 New Revision: 406625 URL: http://svn.apache.org/viewcvs?rev=406625&view=rev Log: Add a suffix-based URLFilter. Also correct the extension IDs of the other urlfilter plugins, so that they can all be active at the same time.
Added: lucene/nutch/trunk/conf/suffix-urlfilter.txt (with props) lucene/nutch/trunk/src/plugin/urlfilter-suffix/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/build.xml (with props) lucene/nutch/trunk/src/plugin/urlfilter-suffix/plugin.xml (with props) lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java (with props) lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java (with props) Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=406625&r1=406624&r2=406625&view=diff ============================================================================== --- lucene/nutch/trunk/conf/nutch-default.xml 
(original) +++ lucene/nutch/trunk/conf/nutch-default.xml Mon May 15 05:14:36 2006 @@ -628,6 +628,13 @@ </property> <property> + <name>urlfilter.suffix.file</name> + <value>suffix-urlfilter.txt</value> + <description>Name of file on CLASSPATH containing url suffixes + used by urlfilter-suffix (SuffixURLFilter) plugin.</description> +</property> + +<property> <name>urlfilter.order</name> <value></value> <description>The order by which url filters are applied. Added: lucene/nutch/trunk/conf/suffix-urlfilter.txt URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/suffix-urlfilter.txt?rev=406625&view=auto ============================================================================== --- lucene/nutch/trunk/conf/suffix-urlfilter.txt (added) +++ lucene/nutch/trunk/conf/suffix-urlfilter.txt Mon May 15 05:14:36 2006 @@ -0,0 +1,11 @@ +# config file for urlfilter-suffix plugin + +# case-insensitive, allow unknown suffixes ++I + +# prohibit these +.gif +.jpg +.jpeg +.bmp +.png Propchange: lucene/nutch/trunk/conf/suffix-urlfilter.txt ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=406625&r1=406624&r2=406625&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Mon May 15 05:14:36 2006 @@ -56,6 +56,7 @@ <ant dir="urlfilter-automaton" target="deploy"/> <ant dir="urlfilter-prefix" target="deploy"/> <ant dir="urlfilter-regex" target="deploy"/> + <ant dir="urlfilter-suffix" target="deploy"/> </target> <!-- ====================================================== --> @@ -81,6 +82,7 @@ <ant dir="parse-zip" target="test"/> <ant dir="urlfilter-automaton" target="test"/> <ant dir="urlfilter-regex" target="test"/> + <ant dir="urlfilter-suffix" target="test"/> </parallel> 
</target> @@ -135,6 +137,7 @@ <ant dir="urlfilter-automaton" target="clean"/> <ant dir="urlfilter-prefix" target="clean"/> <ant dir="urlfilter-regex" target="clean"/> + <ant dir="urlfilter-suffix" target="clean"/> </target> </project> Modified: lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml?rev=406625&r1=406624&r2=406625&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml Mon May 15 05:14:36 2006 @@ -18,7 +18,7 @@ <import plugin="lib-regex-filter"/> </requires> - <extension id="org.apache.nutch.net.urlfilter" + <extension id="org.apache.nutch.net.urlfilter.automaton" name="Nutch Automaton URL Filter" point="org.apache.nutch.net.URLFilter"> <implementation id="AutomatonURLFilter" Modified: lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml?rev=406625&r1=406624&r2=406625&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/urlfilter-prefix/plugin.xml Mon May 15 05:14:36 2006 @@ -16,7 +16,7 @@ <import plugin="nutch-extensionpoints"/> </requires> - <extension id="org.apache.nutch.net.urlfilter" + <extension id="org.apache.nutch.net.urlfilter.prefix" name="Nutch Prefix URL Filter" point="org.apache.nutch.net.URLFilter"> <implementation id="PrefixURLFilter" Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml?rev=406625&r1=406624&r2=406625&view=diff ============================================================================== --- 
lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/plugin.xml Mon May 15 05:14:36 2006 @@ -17,7 +17,7 @@ <import plugin="lib-regex-filter"/> </requires> - <extension id="org.apache.nutch.net.urlfilter" + <extension id="org.apache.nutch.net.urlfilter.regex" name="Nutch Regex URL Filter" point="org.apache.nutch.net.URLFilter"> <implementation id="RegexURLFilter" Added: lucene/nutch/trunk/src/plugin/urlfilter-suffix/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-suffix/build.xml?rev=406625&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-suffix/build.xml (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-suffix/build.xml Mon May 15 05:14:36 2006 @@ -0,0 +1,7 @@ +<?xml version="1.0"?> + +<project name="urlfilter-suffix" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Propchange: lucene/nutch/trunk/src/plugin/urlfilter-suffix/build.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/urlfilter-suffix/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-suffix/plugin.xml?rev=406625&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-suffix/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-suffix/plugin.xml Mon May 15 05:14:36 2006 @@ -0,0 +1,32 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<plugin + id="urlfilter-suffix" + name="Suffix URL Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlfilter-suffix.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlfilter.suffix" + name="Nutch Suffix URL 
Filter" + point="org.apache.nutch.net.URLFilter"> + <implementation id="SuffixURLFilter" + class="org.apache.nutch.urlfilter.suffix.SuffixURLFilter"/> + <!-- by default, attribute "file" is undefined, to keep classic behavior. + <implementation id="SuffixURLFilter" + class="org.apache.nutch.net.SuffixURLFilter"> + <parameter name="file" value="urlfilter-suffix.txt"/> + </implementation> + --> + </extension> + +</plugin> Propchange: lucene/nutch/trunk/src/plugin/urlfilter-suffix/plugin.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java?rev=406625&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java Mon May 15 05:14:36 2006 @@ -0,0 +1,280 @@ +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.urlfilter.suffix; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.LogFormatter; +import org.apache.nutch.net.*; + +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.SuffixStringMatcher; +import org.apache.nutch.util.TrieStringMatcher; + +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; + +import java.io.Reader; +import java.io.FileReader; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.IOException; + +import java.util.List; +import java.util.ArrayList; +import java.util.logging.Logger; + +/** + * Filters URLs based on a file of URL suffixes. The file is named by + * <ol> + * <li>property "urlfilter.suffix.file" in ./conf/nutch-default.xml, and</li> + * <li>attribute "file" in plugin.xml of this plugin</li> + * </ol> + * Attribute "file" has higher precedence if defined. If the config file is + * missing, all URLs will be rejected. + * + * <p>This filter can be configured to work in one of two modes: + * <ul> + * <li><b>default to reject</b> ('-'): in this mode, only URLs that match suffixes + * specified in the config file will be accepted, all other URLs will be + * rejected.</li> + * <li><b>default to accept</b> ('+'): in this mode, only URLs that match suffixes + * specified in the config file will be rejected, all other URLs will be + * accepted.</li> + * </ul> + * <p> + * The format of this config file is one URL suffix per line, with no preceding + * whitespace. Order, in which suffixes are specified, doesn't matter. Blank + * lines and comments (#) are allowed. + * </p> + * <p> + * A single '+' or '-' sign not followed by any suffix must be used once, to + * signify the mode this plugin operates in. An optional single 'I' can be appended, + * to signify that suffix matches should be case-insensitive. The default, if + * not specified, is to use case-sensitive matches, i.e. 
suffix '.JPG' + * does not match '.jpg'. + * </p> + * <p> + * NOTE: the format of this file is different from urlfilter-prefix, because + * that plugin doesn't support allowed/prohibited prefixes (only supports + * allowed prefixes). Please note that this plugin does not support regular + * expressions, it only accepts literal suffixes. I.e. a suffix "*.jpg" is most + * probably wrong, you should use ".jpg" instead. + * </p> + * <h4>Example 1</h4> + * <p> + * The configuration shown below will accept all URLs with '.html' or '.htm' + * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), + * and prohibit all other suffixes. + * <p> + * + * <pre> + * # this is a comment + * + * # prohibit all unknown, case-sensitive matching + * - + * + * # collect only HTML files. + * .html + * .htm + * </pre> + * + * </p> + * <h4>Example 2</h4> + * <p> + * The configuration shown below will accept all URLs except common graphical + * formats. + * <p> + * + * <pre> + * # this is a comment + * + * # allow all unknown, case-insensitive matching + * +I + * + * # prohibited suffixes + * .gif + * .png + * .jpg + * .jpeg + * .bmp + * </pre> + * + * </p> + * @author Andrzej Bialecki + */ +public class SuffixURLFilter implements URLFilter { + + private static final Logger LOG = LogFormatter.getLogger(SuffixURLFilter.class.getName()); + + // read in attribute "file" of this plugin. 
+ private String attributeFile = null; + + private SuffixStringMatcher suffixes; + private boolean modeAccept = false; + + private boolean ignoreCase = false; + + private Configuration conf; + + public SuffixURLFilter() throws IOException { + + } + + public SuffixURLFilter(Reader reader) throws IOException { + readConfigurationFile(reader); + } + + public String filter(String url) { + if (url == null) return null; + String _url; + if (ignoreCase) + _url = url.toLowerCase(); + else _url = url; + String a = suffixes.shortestMatch(_url); + if (a == null) { + if (modeAccept) return url; + else return null; + } else { + if (modeAccept) return null; + else return url; + } + } + + public void readConfigurationFile(Reader reader) throws IOException { + + // handle missing config file + if (reader == null) { + LOG.warning("Missing urlfilter.suffix.file, all URLs will be rejected!"); + suffixes = new SuffixStringMatcher(new String[0]); + modeAccept = false; + ignoreCase = false; + return; + } + BufferedReader in = new BufferedReader(reader); + List aSuffixes = new ArrayList(); + boolean allow = false; + boolean ignore = false; + String line; + + while ((line = in.readLine()) != null) { + if (line.length() == 0) continue; + + char first = line.charAt(0); + switch (first) { + case ' ': + case '\n': + case '#': // skip blank & comment lines + break; + case '-': + allow = false; + if (line.length() > 1 && line.charAt(1) == 'I') + ignore = true; + break; + case '+': + allow = true; + if (line.length() > 1 && line.charAt(1) == 'I') + ignore = true; + break; + default: + aSuffixes.add(line); + } + } + if (ignore) { + for (int i = 0; i < aSuffixes.size(); i++) { + aSuffixes.set(i, ((String) aSuffixes.get(i)).toLowerCase()); + } + } + suffixes = new SuffixStringMatcher(aSuffixes); + modeAccept = allow; + ignoreCase = ignore; + } + + public static void main(String args[]) throws IOException { + + SuffixURLFilter filter; + if (args.length >= 1) + filter = new SuffixURLFilter(new 
FileReader(args[0])); + else { + filter = new SuffixURLFilter(); + filter.setConf(NutchConfiguration.create()); + } + + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + String line; + while ((line = in.readLine()) != null) { + String out = filter.filter(line); + if (out != null) { + System.out.println("ACCEPTED " + out); + } else { + System.out.println("REJECTED " + out); + } + } + } + + public void setConf(Configuration conf) { + this.conf = conf; + + String pluginName = "urlfilter-suffix"; + Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(URLFilter.class.getName()).getExtensions(); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + if (extension.getDescriptor().getPluginId().equals(pluginName)) { + attributeFile = extension.getAttribute("file"); + break; + } + } + if (attributeFile != null && attributeFile.trim().equals("")) attributeFile = null; + if (attributeFile != null) { + LOG.info("Attribute \"file\" is defined for plugin " + pluginName + " as " + attributeFile); + } else { + // LOG.warning("Attribute \"file\" is not defined in plugin.xml for + // plugin "+pluginName); + } + + String file = conf.get("urlfilter.suffix.file"); + // attribute "file" takes precedence if defined + if (attributeFile != null) file = attributeFile; + Reader reader = conf.getConfResourceAsReader(file); + + try { + readConfigurationFile(reader); + } catch (IOException e) { + LOG.severe(e.getMessage()); + throw new RuntimeException(e.getMessage(), e); + } + } + + public Configuration getConf() { + return this.conf; + } + + public boolean isModeAccept() { + return modeAccept; + } + + public void setModeAccept(boolean modeAccept) { + this.modeAccept = modeAccept; + } + + public boolean isIgnoreCase() { + return ignoreCase; + } + + public void setIgnoreCase(boolean ignoreCase) { + this.ignoreCase = ignoreCase; + } +} \ No newline at end of file Propchange: 
lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java?rev=406625&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java Mon May 15 05:14:36 2006 @@ -0,0 +1,134 @@ +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.suffix; + +import java.io.IOException; +import java.io.StringReader; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + + +/** + * JUnit test for <code>SuffixURLFilter</code>. 
+ * + * @author Andrzej Bialecki + */ +public class TestSuffixURLFilter extends TestCase { + private static final String suffixes = + "# this is a comment\n" + + "\n" + + ".gif\n" + + ".jpg\n"; + + private static final String[] urls = new String[] { + "http://www.example.com/test.gif", + "http://www.example.com/TEST.GIF", + "http://www.example.com/test.jpg", + "http://www.example.com/test.JPG", + "http://www.example.com/test.html", + "http://www.example.com/test.HTML", + }; + + private static String[] urlsModeAccept = new String[] { + null, + urls[1], + null, + urls[3], + urls[4], + urls[5] + }; + + private static String[] urlsModeReject = new String[] { + urls[0], + null, + urls[2], + null, + null, + null + }; + + private static String[] urlsModeAcceptIgnoreCase = new String[] { + null, + null, + null, + null, + urls[4], + urls[5] + }; + + private static String[] urlsModeRejectIgnoreCase = new String[] { + urls[0], + urls[1], + urls[2], + urls[3], + null, + null + }; + + private SuffixURLFilter filter = null; + + public TestSuffixURLFilter(String testName) { + super(testName); + } + + public static Test suite() { + return new TestSuite(TestSuffixURLFilter.class); + } + + public static void main(String[] args) { + TestRunner.run(suite()); + } + + public void setUp() throws IOException { + filter = new SuffixURLFilter(new StringReader(suffixes)); + } + + public void testModeAccept() { + filter.setIgnoreCase(false); + filter.setModeAccept(true); + for (int i = 0; i < urls.length; i++) { + assertTrue(urlsModeAccept[i] == filter.filter(urls[i])); + } + } + + public void testModeReject() { + filter.setIgnoreCase(false); + filter.setModeAccept(false); + for (int i = 0; i < urls.length; i++) { + assertTrue(urlsModeReject[i] == filter.filter(urls[i])); + } + } + + public void testModeAcceptIgnoreCase() { + filter.setIgnoreCase(true); + filter.setModeAccept(true); + for (int i = 0; i < urls.length; i++) { + assertTrue(urlsModeAcceptIgnoreCase[i] == filter.filter(urls[i])); 
+ } + } + + public void testModeRejectIgnoreCase() { + filter.setIgnoreCase(true); + filter.setModeAccept(false); + for (int i = 0; i < urls.length; i++) { + assertTrue(urlsModeRejectIgnoreCase[i] == filter.filter(urls[i])); + } + } + +} Propchange: lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java ------------------------------------------------------------------------------ svn:eol-style = native ------------------------------------------------------- Using Tomcat but need to do more? Need to support web services, security? Get stuff done quickly with pre-integrated technology to make your job easier Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642 _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs