http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java b/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java new file mode 100644 index 0000000..0be1e31 --- /dev/null +++ b/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.nutch.urlfilter.domain;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Exercises {@link DomainURLFilter} with the sample domain file
+ * <code>hosts.txt</code> and with a domain file that does not exist.
+ */
+public class TestDomainURLFilter {
+
+  private static final String SEPARATOR = System.getProperty("file.separator");
+  private static final String SAMPLES = System.getProperty("test.data", ".");
+
+  /**
+   * Creates a filter that reads the named file from the test data directory
+   * and configures it with a freshly created Nutch configuration.
+   */
+  private static DomainURLFilter createFilter(String fileName)
+      throws Exception {
+    String path = SAMPLES + SEPARATOR + fileName;
+    Configuration nutchConf = NutchConfiguration.create();
+    DomainURLFilter filter = new DomainURLFilter(path);
+    filter.setConf(nutchConf);
+    return filter;
+  }
+
+  /** Asserts, in the given order, that the filter returns non-null for each url. */
+  private static void assertAccepted(DomainURLFilter filter, String... urls)
+      throws Exception {
+    for (String candidate : urls) {
+      Assert.assertNotNull(filter.filter(candidate));
+    }
+  }
+
+  /** Asserts, in the given order, that the filter returns null for each url. */
+  private static void assertRejected(DomainURLFilter filter, String... urls)
+      throws Exception {
+    for (String candidate : urls) {
+      Assert.assertNull(filter.filter(candidate));
+    }
+  }
+
+  @Test
+  public void testFilter() throws Exception {
+    DomainURLFilter filter = createFilter("hosts.txt");
+
+    assertAccepted(filter, "http://lucene.apache.org",
+        "http://hadoop.apache.org", "http://www.apache.org");
+    assertRejected(filter, "http://www.google.com", "http://mail.yahoo.com");
+    assertAccepted(filter, "http://www.foobar.net", "http://www.foobas.net",
+        "http://www.yahoo.com", "http://www.foobar.be");
+    assertRejected(filter, "http://www.adobe.com");
+  }
+
+  @Test
+  public void testNoFilter() throws Exception {
+    // https://issues.apache.org/jira/browse/NUTCH-2189
+    // the domain file is missing: every url is expected to come back non-null
+    DomainURLFilter filter = createFilter("this-file-does-not-exist.txt");
+
+    assertAccepted(filter, "http://lucene.apache.org",
+        "http://hadoop.apache.org", "http://www.apache.org",
+        "http://www.google.com", "http://mail.yahoo.com",
+        "http://www.foobar.net", "http://www.foobas.net",
+        "http://www.yahoo.com", "http://www.foobar.be",
+        "http://www.adobe.com");
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt b/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt new file mode 100644 index 0000000..2b88c3b --- /dev/null +++ b/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt @@ -0,0 +1,5 @@ +# comments start with the pound sign +net +apache.org +be +www.yahoo.com \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domainblacklist/build.xml b/nutch-plugins/urlfilter-domainblacklist/build.xml new file mode 100644 index 0000000..19ea483 --- /dev/null +++ b/nutch-plugins/urlfilter-domainblacklist/build.xml @@ -0,0 +1,28 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="urlfilter-domainblacklist" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="data" /> + </copy> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domainblacklist/ivy.xml b/nutch-plugins/urlfilter-domainblacklist/ivy.xml new file mode 100644 index 0000000..24d7606 --- /dev/null +++ b/nutch-plugins/urlfilter-domainblacklist/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../../ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domainblacklist/plugin.xml b/nutch-plugins/urlfilter-domainblacklist/plugin.xml new file mode 100644 index 0000000..04eee6e --- /dev/null +++ b/nutch-plugins/urlfilter-domainblacklist/plugin.xml @@ -0,0 +1,43 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="urlfilter-domainblacklist" + name="Domain Blacklist URL Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlfilter-domainblacklist.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlfilter.domainblacklist" + name="Nutch Domain Blacklist URL Filter" + point="org.apache.nutch.net.URLFilter"> + <implementation id="DomainBlacklistURLFilter" + class="org.apache.nutch.urlfilter.domainblacklist.DomainBlacklistURLFilter"> + <parameter name="file" value="domainblacklist-urlfilter.txt"/> + </implementation> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domainblacklist/pom.xml b/nutch-plugins/urlfilter-domainblacklist/pom.xml new file mode 100644 index 0000000..a814579 --- /dev/null +++ b/nutch-plugins/urlfilter-domainblacklist/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlfilter-domainblacklist</artifactId> + <packaging>jar</packaging> + + <name>urlfilter-domainblacklist</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java new file mode 100644 index 0000000..37b1cdc --- /dev/null +++ b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * <p>
+ * Filters URLs based on a file containing domain suffixes, domain names, and
+ * hostnames. A url that matches one of the suffixes, domains, or hosts present
+ * in the file is filtered out.
+ * </p>
+ *
+ * <p>
+ * Urls are checked in order of domain suffix, domain name, and hostname against
+ * entries in the domain file. The domain file would be setup as follows with
+ * one entry per line:
+ * </p>
+ *
+ * <pre>
+ * com
+ * apache.org
+ * www.apache.org
+ * </pre>
+ *
+ * <p>
+ * The first line is an example of a filter that would exclude all .com
+ * domains. The second line excludes all urls from apache.org and all of its
+ * subdomains such as lucene.apache.org and hadoop.apache.org. The third line
+ * would exclude only urls from www.apache.org. There is no specific ordering
+ * to entries. The entries are from more general to more specific with the more
+ * general overriding the more specific. Blank lines and lines starting with
+ * <code>#</code> are ignored; entries are matched in lower case.
+ * </p>
+ *
+ * <p>
+ * The rules are taken from the first of the following sources that is defined:
+ * </p>
+ *
+ * <ol>
+ * <li>the value of the property "urlfilter.domainblacklist.rules" in
+ * ./conf/nutch-*.xml, which is parsed directly and takes precedence over any
+ * file,</li>
+ * <li>the domain file passed to the constructor,</li>
+ * <li>the attribute "file" in plugin.xml of this plugin (shipped as
+ * domainblacklist-urlfilter.txt),</li>
+ * <li>the property "urlfilter.domainblacklist.file" in
+ * ./conf/nutch-*.xml.</li>
+ * </ol>
+ *
+ * <p>
+ * A file name is first looked up as a configuration resource and, if that
+ * yields nothing, opened as a plain file.
+ * </p>
+ */
+public class DomainBlacklistURLFilter implements URLFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainBlacklistURLFilter.class);
+
+  // read in attribute "file" of this plugin.
+  private static String attributeFile = null;
+  private Configuration conf;
+  private String domainFile = null;
+  private Set<String> domainSet = new LinkedHashSet<String>();
+
+  /**
+   * Reads the rules line by line and adds every entry, trimmed and lower
+   * cased, to the set of blacklisted suffixes, domains and hosts. The reader
+   * is not closed here; that is left to the caller.
+   *
+   * @param configReader
+   *          source of the rules, one entry per line
+   * @throws IOException
+   *           if reading a line fails
+   */
+  private void readConfiguration(Reader configReader) throws IOException {
+    BufferedReader reader = new BufferedReader(configReader);
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      if (StringUtils.isBlank(line)) {
+        continue;
+      }
+      // trim before testing for '#', so that an indented comment line is
+      // skipped instead of being stored as a domain entry
+      String entry = line.trim();
+      if (!entry.isEmpty() && !entry.startsWith("#")) {
+        domainSet.add(StringUtils.lowerCase(entry));
+      }
+    }
+  }
+
+  /**
+   * Default constructor.
+   */
+  public DomainBlacklistURLFilter() {
+
+  }
+
+  /**
+   * Constructor that specifies the domain file to use.
+   *
+   * @param domainFile
+   *          The domain file, overrides the domainblacklist-urlfilter.txt
+   *          default.
+   */
+  public DomainBlacklistURLFilter(String domainFile) {
+    this.domainFile = domainFile;
+  }
+
+  /**
+   * Sets the configuration and loads the blacklist rules. I/O problems while
+   * loading are logged and leave the filter with the entries read so far.
+   *
+   * @param conf
+   *          configuration to read the plugin settings from
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // get the extensions for domain urlfilter
+    String pluginName = "urlfilter-domainblacklist";
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    for (Extension extension : extensions) {
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+
+    if (attributeFile != null) {
+      LOG.info("Attribute \"file\" is defined for plugin {} as {}",
+          pluginName, attributeFile);
+    } else {
+      LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin {}",
+          pluginName);
+    }
+
+    // domain file and attribute "file" take precedence if defined
+    String file = conf.get("urlfilter.domainblacklist.file");
+    String stringRules = conf.get("urlfilter.domainblacklist.rules");
+    if (domainFile != null) {
+      file = domainFile;
+    } else if (attributeFile != null) {
+      file = attributeFile;
+    }
+
+    if (stringRules == null && file == null) {
+      // nothing to load; opening a null file name would only raise a
+      // NullPointerException
+      LOG.warn("Neither rules nor a domain file are configured for plugin {}",
+          pluginName);
+      return;
+    }
+
+    Reader reader = null;
+    try {
+      if (stringRules != null) { // takes precedence over files
+        reader = new StringReader(stringRules);
+      } else {
+        reader = conf.getConfResourceAsReader(file);
+        if (reader == null) {
+          reader = new FileReader(file);
+        }
+      }
+      readConfiguration(reader);
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    } finally {
+      // release the underlying file or resource stream
+      if (reader != null) {
+        try {
+          reader.close();
+        } catch (IOException e) {
+          LOG.warn("Could not close the domain blacklist rules: "
+              + org.apache.hadoop.util.StringUtils.stringifyException(e));
+        }
+      }
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Checks a url against the blacklist.
+   *
+   * @param url
+   *          the url to check
+   * @return <code>null</code> if the domain suffix, domain name or host of the
+   *         url is blacklisted, or if the url could not be evaluated; the
+   *         unchanged url otherwise
+   */
+  public String filter(String url) {
+    try {
+      // match for suffix, domain, and host in that order. more general will
+      // override more specific
+      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
+      String host = URLUtil.getHost(url);
+      String suffix = null;
+      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
+      if (domainSuffix != null) {
+        suffix = domainSuffix.getDomain();
+      }
+
+      if (domainSet.contains(suffix) || domainSet.contains(domain)
+          || domainSet.contains(host)) {
+        // Matches, filter!
+        return null;
+      }
+
+      // doesn't match, allow
+      return url;
+    } catch (Exception e) {
+
+      // if an error happens, log it and reject the url
+      LOG.error("Could not apply filter on url: " + url + "\n"
+          + org.apache.hadoop.util.StringUtils.stringifyException(e));
+      return null;
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java new file mode 100644 index 0000000..1f0022c --- /dev/null +++ b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL filter plugin to exclude URLs by domain suffixes, domain names, and/or host names. + * See {@link org.apache.nutch.urlfilter.domain} for the counterpart (include only URLs + * matching host or domain). + */ +package org.apache.nutch.urlfilter.domainblacklist; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java b/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java new file mode 100644 index 0000000..d253867 --- /dev/null +++ b/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Exercises {@link DomainBlacklistURLFilter} with the sample domain file
+ * <code>hosts.txt</code>.
+ */
+public class TestDomainBlacklistURLFilter {
+
+  private static final String SEPARATOR = System.getProperty("file.separator");
+  private static final String SAMPLES = System.getProperty("test.data", ".");
+
+  /** Asserts, in the given order, that the filter returns null for each url. */
+  private static void assertBlocked(DomainBlacklistURLFilter filter,
+      String... urls) throws Exception {
+    for (String candidate : urls) {
+      Assert.assertNull(filter.filter(candidate));
+    }
+  }
+
+  /** Asserts, in the given order, that the filter returns non-null for each url. */
+  private static void assertPassed(DomainBlacklistURLFilter filter,
+      String... urls) throws Exception {
+    for (String candidate : urls) {
+      Assert.assertNotNull(filter.filter(candidate));
+    }
+  }
+
+  @Test
+  public void testFilter() throws Exception {
+    String blacklistPath = SAMPLES + SEPARATOR + "hosts.txt";
+    Configuration nutchConf = NutchConfiguration.create();
+    DomainBlacklistURLFilter filter = new DomainBlacklistURLFilter(
+        blacklistPath);
+    filter.setConf(nutchConf);
+
+    assertBlocked(filter, "http://lucene.apache.org",
+        "http://hadoop.apache.org", "http://www.apache.org");
+    assertPassed(filter, "http://www.google.com", "http://mail.yahoo.com");
+    assertBlocked(filter, "http://www.foobar.net", "http://www.foobas.net",
+        "http://www.yahoo.com", "http://www.foobar.be");
+    assertPassed(filter, "http://www.adobe.com");
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt b/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt new file mode 100644 index 0000000..2b88c3b --- /dev/null +++ b/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt @@ -0,0 +1,5 @@ +# comments start with the pound sign +net +apache.org +be +www.yahoo.com \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/README.md ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-ignoreexempt/README.md b/nutch-plugins/urlfilter-ignoreexempt/README.md new file mode 100644 index 0000000..d48b672 --- /dev/null +++ b/nutch-plugins/urlfilter-ignoreexempt/README.md @@ -0,0 +1,43 @@ +urlfilter-ignoreexempt +====================== + This plugin allows certain urls to be exempted when the external links are configured to be ignored. + This is useful when a focused crawl is set up but some resources like static files are linked from CDNs (external domains). + +# How to enable ? +Add `urlfilter-ignoreexempt` value to `plugin.includes` property +```xml +<property> + <name>plugin.includes</name> + <value>protocol-http|urlfilter-(regex|ignoreexempt)...</value> +</property> +``` + +# How to configure rules? + +open `conf/db-ignore-external-exemptions.txt` and add the regex rules. + +## Format : + +The format is the same as `regex-urlfilter.txt`. + Each non-comment, non-blank line contains a regular expression + prefixed by '+' or '-'. The first matching pattern in the file + determines whether a URL is exempted or ignored. If no pattern + matches, the URL is ignored. 
+ + +## Example : + + To exempt urls ending with image extensions, use this rule + +`+(?i)\.(jpg|png|gif)$` + + + +## Testing the Rules : + +After enabling the plugin and adding your rules to `conf/db-ignore-external-exemptions.txt`, run: + +`bin/nutch plugin urlfilter-ignoreexempt org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter http://yoururl.here` + + +This should print `true` for urls which are accepted by configured rules. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-ignoreexempt/build.xml b/nutch-plugins/urlfilter-ignoreexempt/build.xml new file mode 100644 index 0000000..105f551 --- /dev/null +++ b/nutch-plugins/urlfilter-ignoreexempt/build.xml @@ -0,0 +1,55 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="urlfilter-ignoreexempt" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Build compilation dependencies --> + <target name="deps-jar"> + <ant target="jar" inheritall="false" dir="../lib-regex-filter"/> + </target> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-regex-filter/*.jar" /> + <include name="**/urlfilter-regex/*.jar" /> + </fileset> + <pathelement location="${nutch.root}/build/lib-regex-filter/test"/> + <pathelement location="${nutch.root}/build/urlfilter-regex/test"/> + </path> + + <!-- Compile test classes for dependencies --> + <target name="deps-test-compile"> + <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/> + <ant target="compile-test" inheritall="false" dir="../urlfilter-regex"/> + </target> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/> + <ant target="deploy" inheritall="false" dir="../urlfilter-regex"/> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="data" /> + </copy> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-ignoreexempt/ivy.xml b/nutch-plugins/urlfilter-ignoreexempt/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/urlfilter-ignoreexempt/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-ignoreexempt/plugin.xml b/nutch-plugins/urlfilter-ignoreexempt/plugin.xml new file mode 100644 index 0000000..4139ca4 --- /dev/null +++ b/nutch-plugins/urlfilter-ignoreexempt/plugin.xml @@ -0,0 +1,45 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlfilter-ignoreexempt" + name="External Domain Ignore Exemption" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlfilter-ignoreexempt.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + <import plugin="lib-regex-filter"/> + <import plugin="urlfilter-regex"/> + </requires> + + <extension id="org.apache.nutch.net.urlfilter.ignoreexempt" + name="Ignore Exemption Url Filter" + point="org.apache.nutch.net.URLExemptionFilter"> + <implementation id="ExemptionUrlFilter" + class="org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter"> + <parameter name="file" value="db-ignore-external-exemptions.txt"/> + </implementation> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-ignoreexempt/pom.xml b/nutch-plugins/urlfilter-ignoreexempt/pom.xml new file mode 100644 index 0000000..fd26587 --- /dev/null +++ b/nutch-plugins/urlfilter-ignoreexempt/pom.xml @@ -0,0 +1,45 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlfilter-ignoreexempt</artifactId> + <packaging>jar</packaging> + + <name>urlfilter-ignoreexempt</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <dependencies> + <dependency> + <groupId>org.apache.nutch</groupId> + <artifactId>urlfilter-regex</artifactId> + <version>${project.parent.version}</version> + </dependency> + </dependencies> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java new file mode 100644 index 0000000..bbac300 --- /dev/null +++ b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java @@ -0,0 
+1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.ignoreexempt; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLExemptionFilter; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.urlfilter.regex.RegexURLFilter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.util.Arrays; +import java.util.regex.Pattern; +import java.util.List; +import java.util.ArrayList; + + +/** + * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses regex configuration + * to check if URL is eligible for exemption from 'db.ignore.external'. + * When this filter is enabled, the external urls will be checked against configured sequence of regex rules. + *<p> + * The exemption rule file defaults to db-ignore-external-exemptions.txt in the classpath but can be + * overridden using the property <code>"db.ignore.external.exemptions.file" in ./conf/nutch-*.xml</code> + *</p> + * + * The exemption rules are specified in plain text file where each line is a rule. 
+ * The format is the same as `regex-urlfilter.txt`. + * Each non-comment, non-blank line contains a regular expression + * prefixed by '+' or '-'. The first matching pattern in the file + * determines whether a URL is exempted or ignored. If no pattern + * matches, the URL is ignored. + * + * @since Feb 10, 2016 + * @version 1 + * @see org.apache.nutch.net.URLExemptionFilter + * @see org.apache.nutch.urlfilter.regex.RegexURLFilter + */ +public class ExemptionUrlFilter extends RegexURLFilter + implements URLExemptionFilter { + + public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE + = "db.ignore.external.exemptions.file"; + private static final Logger LOG = + LoggerFactory.getLogger(ExemptionUrlFilter.class); + + private List<Pattern> exemptions; + private Configuration conf; + + public List<Pattern> getExemptions() { + return exemptions; + } + + @Override + public boolean filter(String fromUrl, String toUrl) { + //this implementation does not consider fromUrl param. + //the regex rules are applied to toUrl. 
+ return this.filter(toUrl) != null; + } + + /** + * Gets reader for regex rules + */ + protected Reader getRulesReader(Configuration conf) + throws IOException { + String fileRules = conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE); + return conf.getConfResourceAsReader(fileRules); + } + + public static void main(String[] args) { + + if (args.length != 1) { + System.out.println("Error: Invalid Args"); + System.out.println("Usage: " + + ExemptionUrlFilter.class.getName() + " <url>"); + return; + } + String url = args[0]; + ExemptionUrlFilter instance = new ExemptionUrlFilter(); + instance.setConf(NutchConfiguration.create()); + System.out.println(instance.filter(null, url)); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java new file mode 100644 index 0000000..ee949c5 --- /dev/null +++ b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL filter plugin which identifies exemptions to external urls + * when external urls are set to ignore. + * + */ +package org.apache.nutch.urlfilter.ignoreexempt; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-prefix/build.xml b/nutch-plugins/urlfilter-prefix/build.xml new file mode 100644 index 0000000..33faa48 --- /dev/null +++ b/nutch-plugins/urlfilter-prefix/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="urlfilter-prefix" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-prefix/ivy.xml b/nutch-plugins/urlfilter-prefix/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/urlfilter-prefix/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-prefix/plugin.xml b/nutch-plugins/urlfilter-prefix/plugin.xml new file mode 100644 index 0000000..22cfcaf --- /dev/null +++ b/nutch-plugins/urlfilter-prefix/plugin.xml @@ -0,0 +1,47 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="urlfilter-prefix" + name="Prefix URL Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlfilter-prefix.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlfilter.prefix" + name="Nutch Prefix URL Filter" + point="org.apache.nutch.net.URLFilter"> + <implementation id="PrefixURLFilter" + class="org.apache.nutch.urlfilter.prefix.PrefixURLFilter"/> + <!-- by default, attribute "file" is undefined, to keep classic behavior. + <implementation id="PrefixURLFilter" + class="org.apache.nutch.net.PrefixURLFilter"> + <parameter name="file" value="urlfilter-prefix.txt"/> + </implementation> + --> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-prefix/pom.xml b/nutch-plugins/urlfilter-prefix/pom.xml new file mode 100644 index 0000000..65ad019 --- /dev/null +++ b/nutch-plugins/urlfilter-prefix/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlfilter-prefix</artifactId> + <packaging>jar</packaging> + + <name>urlfilter-prefix</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java new file mode 100644 index 0000000..2e955b5 --- /dev/null +++ b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java @@ -0,0 +1,178 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.prefix; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.*; + +import org.apache.nutch.util.PrefixStringMatcher; +import org.apache.nutch.util.TrieStringMatcher; + +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; + +import java.io.Reader; +import java.io.FileReader; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.IOException; +import java.io.StringReader; + +import java.util.List; +import java.util.ArrayList; + +/** + * Filters URLs based on a file of URL prefixes. The file is named by (1) + * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and (2) + * attribute "file" in plugin.xml of this plugin. Attribute "file" has higher + * precedence if defined. + * + * <p> + * The format of this file is one URL prefix per line. + * </p> + */ +public class PrefixURLFilter implements URLFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(PrefixURLFilter.class); + + // read in attribute "file" of this plugin. 
+ private static String attributeFile = null; + + private TrieStringMatcher trie; + + private Configuration conf; + + public PrefixURLFilter() throws IOException { + + } + + public PrefixURLFilter(String stringRules) throws IOException { + trie = readConfiguration(new StringReader(stringRules)); + } + + public String filter(String url) { + if (trie.shortestMatch(url) == null) + return null; + else + return url; + } + + private TrieStringMatcher readConfiguration(Reader reader) throws IOException { + + BufferedReader in = new BufferedReader(reader); + List<String> urlprefixes = new ArrayList<String>(); + String line; + + while ((line = in.readLine()) != null) { + if (line.length() == 0) + continue; + + char first = line.charAt(0); + switch (first) { + case ' ': + case '\n': + case '#': // skip blank & comment lines + continue; + default: + urlprefixes.add(line); + } + } + + return new PrefixStringMatcher(urlprefixes); + } + + public static void main(String args[]) throws IOException { + + PrefixURLFilter filter; + if (args.length >= 1) + filter = new PrefixURLFilter(args[0]); + else + filter = new PrefixURLFilter(); + + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + String line; + while ((line = in.readLine()) != null) { + String out = filter.filter(line); + if (out != null) { + System.out.println(out); + } + } + } + + public void setConf(Configuration conf) { + this.conf = conf; + + String pluginName = "urlfilter-prefix"; + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint(URLFilter.class.getName()).getExtensions(); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + if (extension.getDescriptor().getPluginId().equals(pluginName)) { + attributeFile = extension.getAttribute("file"); + break; + } + } + if (attributeFile != null && attributeFile.trim().equals("")) + attributeFile = null; + if (attributeFile != null) { + if (LOG.isInfoEnabled()) { + LOG.info("Attribute \"file\" is 
defined for plugin " + pluginName + + " as " + attributeFile); + } + } else { + // if (LOG.isWarnEnabled()) { + // LOG.warn("Attribute \"file\" is not defined in plugin.xml for + // plugin "+pluginName); + // } + } + + String file = conf.get("urlfilter.prefix.file"); + String stringRules = conf.get("urlfilter.prefix.rules"); + // attribute "file" takes precedence if defined + if (attributeFile != null) + file = attributeFile; + Reader reader = null; + if (stringRules != null) { // takes precedence over files + reader = new StringReader(stringRules); + } else { + reader = conf.getConfResourceAsReader(file); + } + + if (reader == null) { + trie = new PrefixStringMatcher(new String[0]); + } else { + try { + trie = readConfiguration(reader); + } catch (IOException e) { + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage()); + } + // TODO [email protected]: throw Exception? Because broken api. + throw new RuntimeException(e.getMessage(), e); + } + } + } + + public Configuration getConf() { + return this.conf; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html new file mode 100644 index 0000000..dbed0be --- /dev/null +++ b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html @@ -0,0 +1,5 @@ +<html> +<body> +<p>URL filter plugin to include only URLs which match one of a given list of URL prefixes.</p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java ---------------------------------------------------------------------- diff --git 
a/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java b/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java new file mode 100644 index 0000000..b7a7ce4 --- /dev/null +++ b/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.prefix; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + +import java.io.IOException; + + +/** + * JUnit test for <code>PrefixURLFilter</code>. 
+ * + * @author Talat Uyarer + * @author Cihad Guzel + */ +public class TestPrefixURLFilter extends TestCase { + private static final String prefixes = + "# this is a comment\n" + + "\n" + + "http://\n" + + "https://\n" + + "file://\n" + + "ftp://\n"; + + private static final String[] urls = new String[] { + "http://www.example.com/", + "https://www.example.com/", + "ftp://www.example.com/", + "file://www.example.com/", + "abcd://www.example.com/", + "www.example.com/", + }; + + private static String[] urlsModeAccept = new String[] { + urls[0], + urls[1], + urls[2], + urls[3], + null, + null + }; + + private PrefixURLFilter filter = null; + + public static Test suite() { + return new TestSuite(TestPrefixURLFilter.class); + } + + public static void main(String[] args) { + TestRunner.run(suite()); + } + + public void setUp() throws IOException { + filter = new PrefixURLFilter(prefixes); + } + + public void testModeAccept() { + for (int i = 0; i < urls.length; i++) { + assertTrue(urlsModeAccept[i] == filter.filter(urls[i])); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/build.xml b/nutch-plugins/urlfilter-regex/build.xml new file mode 100644 index 0000000..5b80d08 --- /dev/null +++ b/nutch-plugins/urlfilter-regex/build.xml @@ -0,0 +1,51 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlfilter-regex" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Build compilation dependencies --> + <target name="deps-jar"> + <ant target="jar" inheritall="false" dir="../lib-regex-filter"/> + </target> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-regex-filter/*.jar" /> + </fileset> + <pathelement location="${nutch.root}/build/lib-regex-filter/test"/> + </path> + + <!-- Compile test classes for dependencies --> + <target name="deps-test-compile"> + <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/> + </target> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="sample" includes="**/*.rules, **/*.urls"/> + </copy> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/ivy.xml b/nutch-plugins/urlfilter-regex/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/urlfilter-regex/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/plugin.xml b/nutch-plugins/urlfilter-regex/plugin.xml new file mode 100644 index 0000000..34f4a91 --- /dev/null +++ b/nutch-plugins/urlfilter-regex/plugin.xml @@ -0,0 +1,48 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlfilter-regex" + name="Regex URL Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlfilter-regex.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + <import plugin="lib-regex-filter"/> + </requires> + + <extension id="org.apache.nutch.net.urlfilter.regex" + name="Nutch Regex URL Filter" + point="org.apache.nutch.net.URLFilter"> + <implementation id="RegexURLFilter" + class="org.apache.nutch.urlfilter.regex.RegexURLFilter"/> + <!-- by default, attribute "file" is undefined, to keep classic behavior. + <implementation id="RegexURLFilter" + class="org.apache.nutch.net.RegexURLFilter"> + <parameter name="file" value="urlfilter-regex.txt"/> + </implementation> + --> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/pom.xml b/nutch-plugins/urlfilter-regex/pom.xml new file mode 100644 index 0000000..db9e7bd --- /dev/null +++ b/nutch-plugins/urlfilter-regex/pom.xml @@ -0,0 +1,53 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. 
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlfilter-regex</artifactId> + <packaging>jar</packaging> + + <name>urlfilter-regex</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <dependencies> + <dependency> + <groupId>org.apache.nutch</groupId> + <artifactId>lib-regex-filter</artifactId> + <version>${project.parent.version}</version> + </dependency> + <dependency> + <groupId>org.apache.nutch</groupId> + <artifactId>lib-regex-filter</artifactId> + <version>${project.parent.version}</version> + <scope>test</scope> + <type>test-jar</type> + </dependency> + </dependencies> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java ---------------------------------------------------------------------- diff --git 
a/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java new file mode 100644 index 0000000..2988114 --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java @@ -0,0 +1,111 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.regex; + +// JDK imports +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.urlfilter.api.RegexRule; +import org.apache.nutch.urlfilter.api.RegexURLFilterBase; +import org.apache.nutch.util.NutchConfiguration; + +/** + * Filters URLs based on a file of regular expressions using the + * {@link java.util.regex Java Regex implementation}. 
+ */ +public class RegexURLFilter extends RegexURLFilterBase { + + public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file"; + public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules"; + + public RegexURLFilter() { + super(); + } + + public RegexURLFilter(String filename) throws IOException, + PatternSyntaxException { + super(filename); + } + + RegexURLFilter(Reader reader) throws IOException, IllegalArgumentException { + super(reader); + } + + /* + * ----------------------------------- * <implementation:RegexURLFilterBase> * + * ----------------------------------- + */ + + /** + * Rules specified as a config property will override rules specified as a + * config file. + */ + protected Reader getRulesReader(Configuration conf) throws IOException { + String stringRules = conf.get(URLFILTER_REGEX_RULES); + if (stringRules != null) { + return new StringReader(stringRules); + } + String fileRules = conf.get(URLFILTER_REGEX_FILE); + return conf.getConfResourceAsReader(fileRules); + } + + // Inherited Javadoc + protected RegexRule createRule(boolean sign, String regex) { + return new Rule(sign, regex); + } + + protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) { + return new Rule(sign, regex, hostOrDomain); + } + + + + /* + * ------------------------------------ * </implementation:RegexURLFilterBase> + * * ------------------------------------ + */ + + public static void main(String args[]) throws IOException { + RegexURLFilter filter = new RegexURLFilter(); + filter.setConf(NutchConfiguration.create()); + main(filter, args); + } + + private class Rule extends RegexRule { + + private Pattern pattern; + + Rule(boolean sign, String regex) { + this(sign, regex, null); + } + + Rule(boolean sign, String regex, String hostOrDomain) { + super(sign, regex, hostOrDomain); + pattern = Pattern.compile(regex); + } + + protected boolean match(String url) { + return pattern.matcher(url).find(); + } + } + +} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html new file mode 100644 index 0000000..7acf73b --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html @@ -0,0 +1,5 @@ +<html> +<body> +<p>URL filter plugin to include and/or exclude URLs matching Java regular expressions.</p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java new file mode 100644 index 0000000..b86181e --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.regex; + +// JDK imports +import java.io.IOException; +import java.io.Reader; + +import org.apache.nutch.net.*; +// Nutch imports +import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest; +import org.junit.Assert; +import org.junit.Test; + +/** + * JUnit based test of class <code>RegexURLFilter</code>. + * + * Supplies a <code>RegexURLFilter</code> through <code>getURLFilter(Reader)</code> + * and drives the <code>test(String)</code> and <code>bench(int, String)</code> + * helpers, which are not defined in this class (inherited). + * + * @author Jérôme Charron + */ +public class TestRegexURLFilter extends RegexURLFilterBaseTest { + + /** + * Builds the <code>RegexURLFilter</code> under test from the given rules. + * An <code>IOException</code> from the constructor is reported through + * <code>Assert.fail</code>; the <code>return null</code> after it only + * satisfies the compiler. + * + * @param rules reader over the filter rules + * @return the filter built from <code>rules</code> + */ + protected URLFilter getURLFilter(Reader rules) { + try { + return new RegexURLFilter(rules); + } catch (IOException e) { + Assert.fail(e.toString()); + return null; + } + } + + /** + * Runs the fixtures named "WholeWebCrawling" and "IntranetCrawling", then + * the "Benchmarks" fixture with first arguments 50, 100, 200, 400 and 800. + * NOTE(review): the meaning of the numeric argument is defined by the + * inherited bench helper, not here. + */ + @Test + public void test() { + test("WholeWebCrawling"); + test("IntranetCrawling"); + bench(50, "Benchmarks"); + bench(100, "Benchmarks"); + bench(200, "Benchmarks"); + bench(400, "Benchmarks"); + bench(800, "Benchmarks"); + } + + /** + * Runs the fixture named "nutch1838"; presumably a regression check for + * issue NUTCH-1838 -- confirm. + */ + @Test + public void test1838() { + test("nutch1838"); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules new file mode 100644 index 0000000..c8901e2 --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules @@ -0,0 +1,26 @@ +# The url filter file used by the crawl command. + +# Better for intranet crawling. +# Be sure to change MY.DOMAIN.NAME to your domain name.
+ +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file:, ftp:, & mailto: urls +-^(file|ftp|mailto): + +# skip image and other suffixes we can't yet parse +-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$ + +# skip URLs containing certain characters as probable queries, etc. +-[?*!@=] + +# skip .fr .org and .net domains +-^.*//.*\.fr/ +-^.*//.*\.org/ +-^.*//.*\.net/ + +# accept everything else ++. http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls new file mode 100644 index 0000000..40bf4ee --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls @@ -0,0 +1,297 @@ ++http://www.hostip.info/ +-http://www.elanceur.org/Articles/OntologieSurfaite.html ++http://www.opensymphony.com/quartz/ +-http://www.portletbridge.org/saxbenchmark/index.html ++http://www.lesmotsdelinfo.com/ ++http://usefulinc.com/doap/ ++http://www.codezoo.com/ ++http://search.infocious.com/ +-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html ++http://www.brics.dk/%7Eamoeller/automaton/ ++http://jazzz.com/wp.html ++http://www.maxkiesler.com/index.php ++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html ++http://www.alias-i.com/lingpipe/ +-http://johnny.ihackstuff.com/index.php?module=prodreviews +-http://www.spurl.net/ ++http://www.dropload.com/ ++http://vivisimo.com/ ++http://www.marumushi.com/apps/newsmap/newsmap.cfm ++http://www.ixquick.com/ +-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
++http://www.mail-archive.com/ ++http://www.spymac.com/ +-http://browsers.evolt.org/ +-http://www.oswd.org/ ++http://www.stayinvisible.com/index.pl ++http://java.sun.com/j2se/1.4.2/docs/api/index.html ++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx ++http://www.bloglines.com/ +-http://www.fckeditor.net/ ++http://search.msn.com/ +-http://www.grub.org/ ++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html +-http://www.mnot.net/cache_docs/ +-http://www.furl.net/ ++http://www.blogpulse.com/ ++http://www.googlefight.com/ ++http://www.rokulabs.com/ +-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php +-http://www.batbox.org/wrt54g-linux.html +-http://en.wikipedia.org/wiki/%s ++http://www.sipcenter.com/ ++http://www.merriampark.com/ld.htm ++http://anon.inf.tu-dresden.de/index_en.html ++http://www.pluck.com/ ++http://www.tiddlywiki.com/ ++http://www.jux2.com/ ++http://clusty.com/ +-http://findability.org/ ++http://www.searchengineshowdown.com/ ++http://www.nhacks.com/email/index.php ++http://www.koders.com/ ++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf ++http://www.gmailwiki.com/index.php/Main_Page ++http://www.tadalist.com/ ++http://www.net2ftp.com/ ++http://www.streamload.com/ ++http://www.lucazappa.com/brilliantMaker/buttonImage.php ++http://www.hybernaut.com/bdv/delicious-import.html ++http://www.gtmcknight.com/buttons/ ++http://amb.vis.ne.jp/mozilla/scrapbook/ ++http://g-metrics.com/index.php +-http://tor.eff.org/ ++http://www.search-this.com/search_engine_decoder.asp ++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html ++http://www.adaptivepath.com/publications/essays/archives/000385.php +-http://isnoop.net/gmail/ +-http://openweb.eu.org/ ++http://www.mistergooddeal.com/ ++http://javatoolbox.com/ +-http://www.freenews.fr/ ++http://www.wikiwax.com/ +-http://today.java.net/pub/a/today/2005/04/21/farm.html ++http://users.skynet.be/J.Beever/pave.htm ++http://www.lundi8h.com/ 
++http://www.snap.com/ ++http://www.goosee.com/puppy/index.shtml +-http://www.softwarefreedom.org/index.html +-http://y.20q.net/ ++http://www.bitty.com/ ++http://www.lafraise.com/ +-http://www.liquidinformation.org/ ++http://www.searchtools.com/ ++http://www.martinfowler.com/articles/injection.html ++http://pdos.csail.mit.edu/scigen/ +-http://developer.yahoo.net/blog/ ++http://blogger-templates.blogspot.com/ ++http://phpadsnew.com/two/ ++http://www.langreiter.com/exec/yahoo-vs-google.html +-http://www.dataparksearch.org/ +-http://www.yubnub.org/ +-http://www.fing.org/ +-http://www.swish-e.org/ +-http://www.openajax.net/wordpress/ ++http://crypto.stanford.edu/PwdHash/ ++http://www.html-kit.com/favicon/ +-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1 ++http://www.durhamtownship.com/ ++http://jiwire.com/ ++http://www.insilmaril.de/vym/ +-http://www.spreadshirt.net/ ++http://www.goffice.com/ ++http://www.writely.com/ ++http://www.milindparikh.com/ ++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html ++http://www.wikyblog.com/Map/Guest/Home +-http://www.kottke.org/05/08/googleos-webos ++http://www.rollyo.com/ ++http://www.meebo.com/ ++http://www.factbites.com/ ++http://www.placeopedia.com/ ++http://swoogle.umbc.edu/ ++http://www.viaduc.com/ +-http://demo.wikiwyg.net/wikiwyg/demo/standalone/ ++http://podcasts.yahoo.com/ +-http://beaglewiki.org/Main_Page ++http://yq.search.yahoo.com/ +-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1 ++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html ++http://socialight.com/ ++http://www.lexxe.com/ ++http://www.xom.nu/ ++http://www.turboprint.de/ ++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27 ++http://www.wi-fiplanet.com/tutorials/article.php/3562391 ++http://particletree.com/features/10-tips-to-a-better-form/ ++http://www.songbirdnest.com/ +-http://www.w3.org/Talks/Tools/Slidy/ +-http://www.compassframework.org/display/SITE/Home 
++http://motrech.blogspot.com/ ++http://www.moteurzine.com/ ++http://www.mex-search.com/ +-http://beta.previewseek.com/?mdc=y&twin=n&ilang=french ++http://www.goshme.com/ ++http://rialto.application-servers.com/ ++http://www.multe-pass.com/ ++http://www.tailrank.com/ ++http://www.vandertramp.com/INTERNETDOWN/ ++http://www.letterjames.de/index.html ++http://code.google.com/index.html ++http://www.kritx.com/ ++http://performancing.com/firefox ++http://www.mywebsearch.com/ +-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1 ++http://www.lukew.com/resources/articles/blogs2.asp +-http://www.hyperwords.net/ ++http://ajax.parish.ath.cx/translator/ ++http://www.maplandia.com/ +-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages ++http://onefeed.com/index.php ++http://www.file-swap.com/ +-http://opennlp.org/ ++http://mindprod.com/jgloss/encoding.html ++http://code.google.com/webstats/index.html ++http://www.freeweb-hosting.com/google_pagerank_pr_checker/ +-http://www.framakey.org/ +-http://microformats.org/wiki/hreview +-http://www.ashesandsnow.org/index2.html +-http://uima-framework.sourceforge.net/ ++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html +-http://www.anandtech.com/IT/showdoc.aspx?i=2523&p=2 ++http://fr.techcrunch.com/ +-http://developer.yahoo.net/yui/ ++http://www.fredrikodman.com/ ++http://www.mpirical.com/companion/mpirical_companion.html ++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html +-http://k9copy.free.fr/ +-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 +-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design +-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2 ++http://blogokat.canalblog.com/archives/2005/11/02/882454.html ++http://robur.slu.se/jensl/xmlclitools/ +-http://www.internetactu.net/?p=6291 +-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1 
++http://www.memodata.com/2004/fr/alexandria/ +-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave ++http://www.randomerror.com/ ++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/ +-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395 +-http://interstices.info/display.jsp?id=c_15918 ++http://www.tech-invite.com/ ++http://www.croczilla.com/zap +-http://www.libervis.com/modules/wordpress/?p=13 ++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/ +-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm ++http://www.influo.com/ ++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html +-http://www.addnb.org/fr/docs/webinvisible.htm +-http://manhack.net/ +-http://www.jibaku.net/ ++http://www.pipologie.com/ ++http://christophenoel.blogspot.com/ +-http://www.seekport.fr/seekbot/ ++http://beta.exalead.com/ +-http://www.boolgum.fr/index.html ++http://www.kesako.canalblog.com/ ++http://loran.blogspot.com/ ++http://outils-recherche.blogspot.com/ ++http://www.art-dept.com/artists/giacobbe/ ++http://www.meggould.netfirms.com/site_seeingIII.htm ++http://www.freedpi.com/ ++http://www.frenchfred.com/ ++http://www.photoways.com/ +-http://freco.free.fr/index.htm +-http://triturages.free.fr/index.htm +-http://www.qsos.org/ ++http://www.alvis.info/alvis/ ++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/ +-http://www.shinux.org/ ++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml ++http://www.kurobox.com/online/tiki-index.php +-http://news.gmane.org/gmane.comp.misc.linkstation.linux ++http://www.imsbook.com/SIP-IMS-Standards-List.html +-http://incubator.apache.org/directory/subprojects/snickers/ +-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html +-http://sourceforge.net/projects/cryptix-asn1/ +-http://sourceforge.net/projects/basn/ 
+-http://asn1.elibel.tm.fr/fr/index.htm +-http://sourceforge.net/projects/a2j/ ++http://www.degrouptest.com/ ++http://interstices.info/ ++http://louvre-boite.viabloga.com/news/18.shtml +-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html ++http://poiplace.oabsoftware.nl/ +-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759 +-http://www.yoono.com/favorites.jsp?user-id=lquerel +-http://www.librecours.org/cgi-bin/main +-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1 +-http://limo.sourceforge.net/ ++http://www-scf.usc.edu/%7Emattmann/ ++http://spaces.msn.com/members/famillezen/ +-http://photos.joune.org/ +-http://www.canon.fr/paperart/ ++http://flash.eastweb.ru/files/20051024092150.swf ++http://www.xsltwiki.com/index.php/Main_Page ++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/ +-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31 ++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html +-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/ ++http://www.aeliosfinance.com/ ++http://www.capital-it.com/ +-http://www.tradedoubler.fr/pan/public/solutions/publisher +-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm ++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/ ++http://wanabo.com/ +-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1 +-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam ++http://aeliosfinance.com/ ++http://www.centreincubation.com/ ++http://www.franceincubation.com/ +-http://www.oseo.fr/ ++http://www.i18nfaq.com/chardet.html +-http://cpdetector.sourceforge.net/ ++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles ++http://chezlorry.ca/Accueil.htm ++http://cetnia.blogs.com/d_lires/ +-http://www.directwine.fr/ ++http://www.new-phenix.com/ +-http://upnp.sourceforge.net/ 
+-http://www.pixmania.fr/ +-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 ++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/ ++http://www.stepnewz.com/sn/default.asp ++http://opquast.com/ +-http://www.freeplayer.org/ +-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie +-http://atomcomputer.free.fr/fbox/ +-http://www.internetactu.net/index.php?p=6100 +-http://mammouthland.free.fr/cours/css/genecss.php +-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1 ++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html +-http://xml.apache.org/xalan-j/extensions.html ++http://developers.sun.com/foryourbusiness/jcc/ ++http://blogs.sun.com/roller/page/roumen/Weblog +-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1 +-http://blog.developpez.com/index.php?blog=51&p=1389&more=1&c=1&tb=1&pb=1 ++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/ ++http://odur.let.rug.nl/%7Evannoord/ +-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html +-http://artist.inist.fr/ ++http://www.elra.info/ +-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO ++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability ++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval ++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/ ++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/ ++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/ ++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/ 
++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/ ++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html +-http://www.lexique.org/ ++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/ ++http://www.streamium.com/products/mx6000i/ +-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr +-http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73 ++http://www.tversity.com/ +-http://www.aspseek.org/index.php \ No newline at end of file
