http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java new file mode 100644 index 0000000..2988114 --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java @@ -0,0 +1,111 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.regex; + +// JDK imports +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.urlfilter.api.RegexRule; +import org.apache.nutch.urlfilter.api.RegexURLFilterBase; +import org.apache.nutch.util.NutchConfiguration; + +/** + * Filters URLs based on a file of regular expressions using the + * {@link java.util.regex Java Regex implementation}. 
+ */
+public class RegexURLFilter extends RegexURLFilterBase {
+
+  /** Name of the configuration property that names the rules resource. */
+  public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file";
+
+  /** Name of the configuration property that holds the rules inline. */
+  public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules";
+
+  /** Forwards to the no-argument constructor of the base class. */
+  public RegexURLFilter() {
+    super();
+  }
+
+  /**
+   * Forwards <code>filename</code> to the base-class constructor.
+   *
+   * @param filename
+   *          passed through unchanged; presumably the name of the rules file
+   *          (handled by the base class, not visible here)
+   */
+  public RegexURLFilter(String filename) throws IOException,
+      PatternSyntaxException {
+    super(filename);
+  }
+
+  /**
+   * Forwards <code>reader</code> to the base-class constructor. Package
+   * private; the unit test of this plugin creates its filter this way.
+   */
+  RegexURLFilter(Reader reader) throws IOException, IllegalArgumentException {
+    super(reader);
+  }
+
+  /* ---- implementation of RegexURLFilterBase: begin ---- */
+
+  /**
+   * Returns the reader the rules are loaded from. Rules specified as a config
+   * property ({@link #URLFILTER_REGEX_RULES}) will override rules specified as
+   * a config file ({@link #URLFILTER_REGEX_FILE}).
+   *
+   * @param conf
+   *          configuration both properties are looked up in
+   * @return a reader over the inline rules if that property is set, otherwise
+   *         whatever <code>conf.getConfResourceAsReader</code> returns for the
+   *         configured file name
+   */
+  protected Reader getRulesReader(Configuration conf) throws IOException {
+    String stringRules = conf.get(URLFILTER_REGEX_RULES);
+    if (stringRules != null) {
+      return new StringReader(stringRules);
+    }
+    String fileRules = conf.get(URLFILTER_REGEX_FILE);
+    return conf.getConfResourceAsReader(fileRules);
+  }
+
+  // Inherited Javadoc
+  protected RegexRule createRule(boolean sign, String regex) {
+    return new Rule(sign, regex);
+  }
+
+  // Same as above, with the additional hostOrDomain value handed to the rule.
+  protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) {
+    return new Rule(sign, regex, hostOrDomain);
+  }
+
+  /* ---- implementation of RegexURLFilterBase: end ---- */
+
+  /**
+   * Command-line entry point: creates a filter, configures it with
+   * <code>NutchConfiguration.create()</code> and hands it, together with the
+   * arguments, to the two-argument <code>main</code> (not defined in this
+   * class; presumably inherited from the base class).
+   */
+  public static void main(String args[]) throws IOException {
+    RegexURLFilter filter = new RegexURLFilter();
+    filter.setConf(NutchConfiguration.create());
+    main(filter, args);
+  }
+
+  /**
+   * Rule backed by a {@link java.util.regex.Pattern}. Declared static because
+   * it never touches the enclosing filter, so no rule keeps a hidden reference
+   * to the filter that created it.
+   */
+  private static class Rule extends RegexRule {
+
+    /** The regular expression, compiled once when the rule is created. */
+    private final Pattern pattern;
+
+    Rule(boolean sign, String regex) {
+      this(sign, regex, null);
+    }
+
+    /**
+     * @throws PatternSyntaxException
+     *           if <code>regex</code> is not a valid regular expression
+     */
+    Rule(boolean sign, String regex, String hostOrDomain) {
+      super(sign, regex, hostOrDomain);
+      pattern = Pattern.compile(regex);
+    }
+
+    /**
+     * Uses <code>find()</code>: the rule matches as soon as the expression
+     * matches any part of the URL. Anchor the expression with ^ and $ to
+     * require a match of the whole URL.
+     */
+    protected boolean match(String url) {
+      return pattern.matcher(url).find();
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html new file mode 100644 index 0000000..7acf73b --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html @@ -0,0 +1,5 @@ +<html> +<body> +<p>URL filter plugin to include and/or exclude URLs matching Java regular expressions.</p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java new file mode 100644 index 0000000..b86181e --- /dev/null +++ b/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.regex;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.nutch.net.*;
+// Nutch imports
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Runs the shared test scenarios of <code>RegexURLFilterBaseTest</code>
+ * against <code>RegexURLFilter</code>.
+ *
+ * @author Jérôme Charron
+ */
+public class TestRegexURLFilter extends RegexURLFilterBaseTest {
+
+  /**
+   * Creates the filter under test from the given rules. An I/O problem while
+   * reading the rules fails the running test.
+   */
+  protected URLFilter getURLFilter(Reader rules) {
+    URLFilter created = null;
+    try {
+      created = new RegexURLFilter(rules);
+    } catch (IOException ioe) {
+      // Assert.fail always throws, so the null is never handed to a caller.
+      Assert.fail(ioe.toString());
+    }
+    return created;
+  }
+
+  /** Two functional scenarios, then the benchmark at 50, 100, 200, 400, 800. */
+  @Test
+  public void test() {
+    test("WholeWebCrawling");
+    test("IntranetCrawling");
+    for (int size = 50; size <= 800; size *= 2) {
+      bench(size, "Benchmarks");
+    }
+  }
+
+  /** Scenario named after issue NUTCH-1838. */
+  @Test
+  public void test1838() {
+    test("nutch1838");
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/build.xml b/nutch-plugins/urlfilter-suffix/build.xml
new file mode 100644
index 0000000..e5382c6
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlfilter-suffix" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-suffix/ivy.xml b/nutch-plugins/urlfilter-suffix/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/urlfilter-suffix/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-suffix/plugin.xml b/nutch-plugins/urlfilter-suffix/plugin.xml new file mode 100644 index 0000000..f326d15 --- /dev/null +++ b/nutch-plugins/urlfilter-suffix/plugin.xml @@ -0,0 +1,47 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="urlfilter-suffix" + name="Suffix URL Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlfilter-suffix.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlfilter.suffix" + name="Nutch Suffix URL Filter" + point="org.apache.nutch.net.URLFilter"> + <implementation id="SuffixURLFilter" + class="org.apache.nutch.urlfilter.suffix.SuffixURLFilter"/> + <!-- by default, attribute "file" is undefined, to keep classic behavior. + <implementation id="SuffixURLFilter" + class="org.apache.nutch.urlfilter.suffix.SuffixURLFilter"> + <parameter name="file" value="urlfilter-suffix.txt"/> + </implementation> + --> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-suffix/pom.xml b/nutch-plugins/urlfilter-suffix/pom.xml new file mode 100644 index 0000000..82023c6 --- /dev/null +++ b/nutch-plugins/urlfilter-suffix/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlfilter-suffix</artifactId> + <packaging>jar</packaging> + + <name>urlfilter-suffix</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java new file mode 100644 index 0000000..39c541f --- /dev/null +++ b/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java @@ -0,0 +1,331 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.urlfilter.suffix; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.*; + +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.SuffixStringMatcher; + +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Reader; +import java.io.FileReader; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.IOException; +import java.io.StringReader; + +import java.util.List; +import java.util.ArrayList; + +import java.net.URL; +import java.net.MalformedURLException; + +/** + * Filters URLs based on a file of URL suffixes. The file is named by + * <ol> + * <li>property "urlfilter.suffix.file" in ./conf/nutch-default.xml, and</li> + * <li>attribute "file" in plugin.xml of this plugin</li> + * </ol> + * Attribute "file" has higher precedence if defined. If the config file is + * missing, all URLs will be rejected. 
+ * + * <p> + * This filter can be configured to work in one of two modes: + * <ul> + * <li><b>default to reject</b> ('-'): in this mode, only URLs that match + * suffixes specified in the config file will be accepted, all other URLs will + * be rejected.</li> + * <li><b>default to accept</b> ('+'): in this mode, only URLs that match + * suffixes specified in the config file will be rejected, all other URLs will + * be accepted.</li> + * </ul> + * <p> + * The format of this config file is one URL suffix per line, with no preceding + * whitespace. Order, in which suffixes are specified, doesn't matter. Blank + * lines and comments (#) are allowed. + * </p> + * <p> + * A single '+' or '-' sign not followed by any suffix must be used once, to + * signify the mode this plugin operates in. An optional single 'I' can be + * appended, to signify that suffix matches should be case-insensitive. The + * default, if not specified, is to use case-sensitive matches, i.e. suffix + * '.JPG' does not match '.jpg'. An optional single 'P' can be appended as + * well, to signify that suffixes are matched against the path of the URL + * only, i.e. a query string or fragment following the path is ignored. + * </p> + * <p> + * NOTE: the format of this file is different from urlfilter-prefix, because + * that plugin doesn't support allowed/prohibited prefixes (only supports + * allowed prefixes). Please note that this plugin does not support regular + * expressions, it only accepts literal suffixes. I.e. a suffix "*.jpg" is most + * probably wrong, you should use ".jpg" instead. Do not prepend '+' or '-' to + * a suffix either: every line starting with one of these signs is read as the + * mode line, not as a suffix. + * </p> + * <h4>Example 1</h4> + * <p> + * The configuration shown below will accept all URLs with '.html' or '.htm' + * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit + * all other suffixes. + * <p> + * + * <pre> + * # this is a comment + * + * # prohibit all unknown, case-sensitive matching + * - + * + * # collect only HTML files. + * .html + * .htm + * </pre> + * + * </p> + * <h4>Example 2</h4> + * <p> + * The configuration shown below will accept all URLs except common graphical + * formats. 
+ * <p>
+ *
+ * <pre>
+ * # this is a comment
+ *
+ * # allow all unknown, case-insensitive matching
+ * +I
+ *
+ * # prohibited suffixes
+ * .gif
+ * .png
+ * .jpg
+ * .jpeg
+ * .bmp
+ * </pre>
+ *
+ * </p>
+ *
+ * @author Andrzej Bialecki
+ */
+public class SuffixURLFilter implements URLFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(SuffixURLFilter.class);
+
+  // value of attribute "file" of this plugin, null if undefined or blank
+  private String attributeFile = null;
+
+  private SuffixStringMatcher suffixes;
+  // true: '+' mode (reject matching URLs), false: '-' mode (accept only
+  // matching URLs)
+  private boolean modeAccept = false;
+  // true: match suffixes against the path of the URL only
+  private boolean filterFromPath = false;
+  // true: lower-case the URL before matching
+  private boolean ignoreCase = false;
+
+  private Configuration conf;
+
+  /**
+   * Creates an unconfigured filter. No suffixes are loaded until
+   * {@link #setConf(Configuration)} or {@link #readConfiguration(Reader)} has
+   * been called.
+   */
+  public SuffixURLFilter() throws IOException {
+
+  }
+
+  /**
+   * Creates a filter configured from the given rules.
+   *
+   * @param reader
+   *          source of the rules, see {@link #readConfiguration(Reader)}; it
+   *          is not closed here
+   */
+  public SuffixURLFilter(Reader reader) throws IOException {
+    readConfiguration(reader);
+  }
+
+  /**
+   * Applies the suffix rules to one URL.
+   *
+   * @param url
+   *          URL to check, may be null
+   * @return <code>url</code> itself (never the lower-cased copy used for
+   *         matching) if it is accepted, null if it is rejected or was null
+   */
+  public String filter(String url) {
+    if (url == null)
+      return null;
+    // Locale.ROOT keeps the lower-casing independent of the default locale
+    // (with a Turkish default locale "GIF" would not become "gif").
+    String candidate = ignoreCase ? url.toLowerCase(java.util.Locale.ROOT)
+        : url;
+    if (filterFromPath) {
+      try {
+        candidate = new URL(candidate).getPath();
+      } catch (MalformedURLException e) {
+        // not parseable as URL: match against the whole string instead
+      }
+    }
+
+    boolean matched = suffixes.shortestMatch(candidate) != null;
+    // '+' mode rejects the matches, '-' mode accepts nothing but the matches
+    return (matched != modeAccept) ? url : null;
+  }
+
+  /**
+   * Replaces the current rules by the ones read from <code>reader</code>. All
+   * three flags (mode, case-insensitivity, path-only matching) are taken from
+   * the rules just read; a flag not given there is switched off, whatever its
+   * value was before.
+   *
+   * @param reader
+   *          source of the rules; if null a warning is logged and the filter
+   *          rejects every URL. The reader is not closed by this method.
+   * @throws IOException
+   *           if reading fails
+   */
+  public void readConfiguration(Reader reader) throws IOException {
+
+    // handle missing config file
+    if (reader == null) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Missing urlfilter.suffix.file, all URLs will be rejected!");
+      }
+      suffixes = new SuffixStringMatcher(new String[0]);
+      modeAccept = false;
+      ignoreCase = false;
+      filterFromPath = false;
+      return;
+    }
+    BufferedReader in = new BufferedReader(reader);
+    List<String> aSuffixes = new ArrayList<String>();
+    boolean allow = false;
+    boolean ignore = false;
+    boolean fromPath = false;
+    String line;
+
+    while ((line = in.readLine()) != null) {
+      line = line.trim();
+      if (line.length() == 0)
+        continue;
+
+      // the line is trimmed and not empty, so it cannot start with whitespace
+      char first = line.charAt(0);
+      switch (first) {
+      case '#': // skip comment lines
+        break;
+      case '-':
+      case '+':
+        // mode line; if there are several, the last one decides the mode
+        allow = (first == '+');
+        if (line.contains("P"))
+          fromPath = true;
+        if (line.contains("I"))
+          ignore = true;
+        break;
+      default:
+        aSuffixes.add(line);
+      }
+    }
+    if (ignore) {
+      for (int i = 0; i < aSuffixes.size(); i++) {
+        aSuffixes.set(i, aSuffixes.get(i).toLowerCase(java.util.Locale.ROOT));
+      }
+    }
+    suffixes = new SuffixStringMatcher(aSuffixes);
+    modeAccept = allow;
+    ignoreCase = ignore;
+    filterFromPath = fromPath;
+  }
+
+  /**
+   * Command-line check: reads URLs from standard input, one per line, and
+   * prints for each whether it is accepted or rejected. The rules are read
+   * from the file named by the first argument if there is one, from the Nutch
+   * configuration otherwise.
+   */
+  public static void main(String args[]) throws IOException {
+
+    SuffixURLFilter filter;
+    if (args.length >= 1) {
+      Reader rules = new FileReader(args[0]);
+      try {
+        filter = new SuffixURLFilter(rules);
+      } finally {
+        rules.close();
+      }
+    } else {
+      filter = new SuffixURLFilter();
+      filter.setConf(NutchConfiguration.create());
+    }
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      String out = filter.filter(line);
+      if (out != null) {
+        System.out.println("ACCEPTED " + out);
+      } else {
+        // out is null here, so print the URL that was read
+        System.out.println("REJECTED " + line);
+      }
+    }
+  }
+
+  /**
+   * Stores the configuration and (re)loads the rules. Sources in order of
+   * precedence: property "urlfilter.suffix.rules" (the rules themselves),
+   * attribute "file" of this plugin in plugin.xml, property
+   * "urlfilter.suffix.file".
+   *
+   * @throws RuntimeException
+   *           wrapping the IOException if the rules cannot be read
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    String pluginName = "urlfilter-suffix";
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    for (Extension extension : extensions) {
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+    if (attributeFile != null && attributeFile.trim().equals(""))
+      attributeFile = null;
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+            + " as " + attributeFile);
+      }
+    }
+
+    String file = conf.get("urlfilter.suffix.file");
+    String stringRules = conf.get("urlfilter.suffix.rules");
+    // attribute "file" takes precedence if defined
+    if (attributeFile != null)
+      file = attributeFile;
+    Reader reader = null;
+    if (stringRules != null) { // takes precedence over files
+      reader = new StringReader(stringRules);
+    } else {
+      reader = conf.getConfResourceAsReader(file);
+    }
+
+    try {
+      readConfiguration(reader);
+    } catch (IOException e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.getMessage());
+      }
+      throw new RuntimeException(e.getMessage(), e);
+    } finally {
+      // the reader was opened here, so it is released here
+      if (reader != null) {
+        try {
+          reader.close();
+        } catch (IOException e) {
+          LOG.warn("Failed to close reader of suffix rules: " + e.getMessage());
+        }
+      }
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /** @return true in '+' mode (URLs matching a suffix are rejected) */
+  public boolean isModeAccept() {
+    return modeAccept;
+  }
+
+  public void setModeAccept(boolean modeAccept) {
+    this.modeAccept = modeAccept;
+  }
+
+  /** @return true if URLs are lower-cased before matching */
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
+
+  /**
+   * Only switches the lower-casing of URLs on or off. Suffixes already loaded
+   * are left as they are, so with true they should be in lower case.
+   */
+  public void setIgnoreCase(boolean ignoreCase) {
+    this.ignoreCase = ignoreCase;
+  }
+
+  /**
+   * @param filterFromPath
+   *          true to match suffixes against the path of the URL only
+   */
+  public void setFilterFromPath(boolean filterFromPath) {
+    this.filterFromPath = filterFromPath;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/package-info.java b/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/package-info.java
new file mode 100644
index 0000000..0449acc
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL filter plugin to either exclude or include only URLs which match + * one of the given (path) suffixes. + */ +package org.apache.nutch.urlfilter.suffix; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java b/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java new file mode 100644 index 0000000..b09ca2f --- /dev/null +++ b/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java @@ -0,0 +1,123 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.suffix;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * JUnit test for <code>SuffixURLFilter</code>. Every test configures the
+ * flags of the filter and compares, URL by URL, the outcome with a table of
+ * expected results: the URL itself where it has to be accepted, null where it
+ * has to be rejected.
+ *
+ * @author Andrzej Bialecki
+ */
+public class TestSuffixURLFilter {
+  private static final String suffixes = "# this is a comment\n" + "\n"
+      + ".gif\n" + ".jpg\n" + ".js\n";
+
+  private static final String[] urls = new String[] {
+      "http://www.example.com/test.gif", "http://www.example.com/TEST.GIF",
+      "http://www.example.com/test.jpg", "http://www.example.com/test.JPG",
+      "http://www.example.com/test.html", "http://www.example.com/test.HTML",
+      "http://www.example.com/test.html?q=abc.js",
+      "http://www.example.com/test.js?foo=bar&baz=bar#12333", };
+
+  private static String[] urlsModeAccept = new String[] { null, urls[1], null,
+      urls[3], urls[4], urls[5], null, urls[7] };
+
+  private static String[] urlsModeReject = new String[] { urls[0], null,
+      urls[2], null, null, null, urls[6], null };
+
+  private static String[] urlsModeAcceptIgnoreCase = new String[] { null, null,
+      null, null, urls[4], urls[5], null, urls[7] };
+
+  private static String[] urlsModeRejectIgnoreCase = new String[] { urls[0],
+      urls[1], urls[2], urls[3], null, null, urls[6], null };
+
+  private static String[] urlsModeAcceptAndPathFilter = new String[] { null,
+      urls[1], null, urls[3], urls[4], urls[5], urls[6], null };
+
+  private static String[] urlsModeAcceptAndNonPathFilter = new String[] { null,
+      urls[1], null, urls[3], urls[4], urls[5], null, urls[7] };
+
+  private SuffixURLFilter filter = null;
+
+  @Before
+  public void setUp() throws IOException {
+    filter = new SuffixURLFilter(new StringReader(suffixes));
+  }
+
+  /**
+   * Filters every test URL and compares the outcome with the expected one.
+   * Compared by value, and the URL is part of the failure message, so a
+   * failure tells which URL was filtered wrongly.
+   *
+   * @param expected
+   *          expected outcome per URL, same order as <code>urls</code>
+   */
+  private void assertFiltered(String[] expected) {
+    for (int i = 0; i < urls.length; i++) {
+      Assert.assertEquals("filter result for " + urls[i], expected[i],
+          filter.filter(urls[i]));
+    }
+  }
+
+  @Test
+  public void testModeAccept() {
+    filter.setIgnoreCase(false);
+    filter.setModeAccept(true);
+    assertFiltered(urlsModeAccept);
+  }
+
+  @Test
+  public void testModeReject() {
+    filter.setIgnoreCase(false);
+    filter.setModeAccept(false);
+    assertFiltered(urlsModeReject);
+  }
+
+  @Test
+  public void testModeAcceptIgnoreCase() {
+    filter.setIgnoreCase(true);
+    filter.setModeAccept(true);
+    assertFiltered(urlsModeAcceptIgnoreCase);
+  }
+
+  @Test
+  public void testModeRejectIgnoreCase() {
+    filter.setIgnoreCase(true);
+    filter.setModeAccept(false);
+    assertFiltered(urlsModeRejectIgnoreCase);
+  }
+
+  @Test
+  public void testModeAcceptAndNonPathFilter() {
+    filter.setModeAccept(true);
+    filter.setFilterFromPath(false);
+    assertFiltered(urlsModeAcceptAndNonPathFilter);
+  }
+
+  @Test
+  public void testModeAcceptAndPathFilter() {
+    filter.setModeAccept(true);
+    filter.setFilterFromPath(true);
+    assertFiltered(urlsModeAcceptAndPathFilter);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/build.xml b/nutch-plugins/urlfilter-validator/build.xml
new file mode 100644
index 0000000..4de9292
--- /dev/null
+++ 
b/nutch-plugins/urlfilter-validator/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlfilter-validator" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-validator/ivy.xml b/nutch-plugins/urlfilter-validator/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/urlfilter-validator/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-validator/plugin.xml b/nutch-plugins/urlfilter-validator/plugin.xml new file mode 100644 index 0000000..413b288 --- /dev/null +++ b/nutch-plugins/urlfilter-validator/plugin.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlfilter-validator" + name="URL Validator" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlfilter-validator.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlfilter.validator" + name="Nutch URL Validator" + point="org.apache.nutch.net.URLFilter"> + <implementation id="URLValidator" + class="org.apache.nutch.urlfilter.validator.UrlValidator"/> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-validator/pom.xml b/nutch-plugins/urlfilter-validator/pom.xml new file mode 100644 index 0000000..9eaf641 --- /dev/null +++ b/nutch-plugins/urlfilter-validator/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlfilter-validator</artifactId> + <packaging>jar</packaging> + + <name>urlfilter-validator</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/UrlValidator.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/UrlValidator.java b/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/UrlValidator.java new file mode 100644 index 0000000..03fca97 --- /dev/null +++ b/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/UrlValidator.java @@ -0,0 +1,386 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.validator; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLFilter; + +/** + * <p> + * Validates URLs. + * </p> + * + * <p> + * Originally based on a PHP script by Debbie Dyer, validation.php v1.2b, Date: + * 03/07/02, http://javascript.internet.com. However, this validation now bears + * little resemblance to the PHP original. + * </p> + * + * <pre> + * Example of usage: + * UrlValidator urlValidator = new UrlValidator(); + * if (urlValidator.filter("ftp://foo.bar.com/") != null) { + * System.out.println("url is valid"); + * } else { + * System.out.println("url is invalid"); + * } + * + * prints out "url is valid" + * </pre> + * + * <p> + * Based on UrlValidator code from Apache commons-validator. + * </p> + * + * @see <a href='http://www.ietf.org/rfc/rfc2396.txt' > Uniform Resource + * Identifiers (URI): Generic Syntax </a> + * + */ +public class UrlValidator implements URLFilter { + + private static final String ALPHA_CHARS = "a-zA-Z"; + + private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d"; + + private static final String SPECIAL_CHARS = ";/@&=,.?:+$"; + + private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]"; + + private static final String SCHEME_CHARS = ALPHA_CHARS; + + // Drop numeric, and "+-." 
for now + private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\."; + + private static final String ATOM = VALID_CHARS + '+'; + + /** + * This expression derived/taken from the BNF for URI (RFC2396). + */ + private static final Pattern URL_PATTERN = Pattern + .compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)" + + "(\\?([^#]*))?(#(.*))?"); + + /** + * Schema/Protocol (ie. http:, ftp:, file:, etc). + */ + private static final int PARSE_URL_SCHEME = 2; + + /** + * Includes hostname/ip and port number. + */ + private static final int PARSE_URL_AUTHORITY = 4; + + private static final int PARSE_URL_PATH = 5; + + private static final int PARSE_URL_QUERY = 7; + + /** + * Protocol (ie. http:, ftp:,https:). + */ + private static final Pattern SCHEME_PATTERN = Pattern.compile("^[" + + SCHEME_CHARS + "]+"); + + private static final Pattern AUTHORITY_PATTERN = Pattern.compile("^([" + + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?"); + + private static final int PARSE_AUTHORITY_HOST_IP = 1; + + private static final int PARSE_AUTHORITY_PORT = 2; + + /** + * Should always be empty. + */ + private static final int PARSE_AUTHORITY_EXTRA = 3; + + private static final Pattern PATH_PATTERN = Pattern + .compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$"); + + private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$"); + + private static final Pattern LEGAL_ASCII_PATTERN = Pattern + .compile("^[\\x21-\\x7E]+$"); + + private static final Pattern IP_V4_DOMAIN_PATTERN = Pattern + .compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$"); + + private static final Pattern DOMAIN_PATTERN = Pattern.compile("^" + ATOM + + "(\\." 
+ ATOM + ")*$"); + + private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$"); + + private static final Pattern ATOM_PATTERN = Pattern.compile("(" + ATOM + ")"); + + private static final Pattern ALPHA_PATTERN = Pattern.compile("^[" + + ALPHA_CHARS + "]"); + + private Configuration conf; + + /** + * Returns the given URL unchanged if it passes validation, or + * <code>null</code> otherwise. + */ + public String filter(String urlString) { + return isValid(urlString) ? urlString : null; + } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + /** + * <p> + * Checks if a field has a valid url address. + * </p> + * + * @param value + * The value validation is being performed on. A <code>null</code> + * value is considered invalid. + * @return true if the url is valid. + */ + private boolean isValid(String value) { + if (value == null) { + return false; + } + + Matcher matchUrlPat = URL_PATTERN.matcher(value); + if (!LEGAL_ASCII_PATTERN.matcher(value).matches()) { + return false; + } + + // Check the whole url address structure + if (!matchUrlPat.matches()) { + return false; + } + + if (!isValidScheme(matchUrlPat.group(PARSE_URL_SCHEME))) { + return false; + } + + if (!isValidAuthority(matchUrlPat.group(PARSE_URL_AUTHORITY))) { + return false; + } + + if (!isValidPath(matchUrlPat.group(PARSE_URL_PATH))) { + return false; + } + + if (!isValidQuery(matchUrlPat.group(PARSE_URL_QUERY))) { + return false; + } + + return true; + } + + /** + * Validates the scheme. A scheme is accepted only when it consists solely of + * ASCII letters, as defined by <code>SCHEME_PATTERN</code>. + * + * @param scheme + * The scheme to validate. A <code>null</code> value is considered + * invalid. + * @return true if valid. + */ + private boolean isValidScheme(String scheme) { + if (scheme == null) { + return false; + } + + return SCHEME_PATTERN.matcher(scheme).matches(); + } + + /** + * Returns true if the authority is properly formatted. 
An authority is the + * combination of hostname and port. A <code>null</code> authority value is + * considered invalid. + * + * @param authority + * Authority value to validate. + * @return true if authority (hostname and port) is valid. + */ + private boolean isValidAuthority(String authority) { + if (authority == null) { + return false; + } + + Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority); + if (!authorityMatcher.matches()) { + return false; + } + + boolean ipV4Address = false; + boolean hostname = false; + // check if authority is IP address or hostname + String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP); + Matcher matchIPV4Pat = IP_V4_DOMAIN_PATTERN.matcher(hostIP); + ipV4Address = matchIPV4Pat.matches(); + + if (ipV4Address) { + // this is an IP address so check components + for (int i = 1; i <= 4; i++) { + String ipSegment = matchIPV4Pat.group(i); + if (ipSegment == null || ipSegment.length() <= 0) { + return false; + } + + try { + if (Integer.parseInt(ipSegment) > 255) { + return false; + } + } catch (NumberFormatException e) { + return false; + } + + } + } else { + // Not an IPv4 address: check whether it is a well-formed hostname + hostname = DOMAIN_PATTERN.matcher(hostIP).matches(); + } + + // The rightmost hostname segment (top-level domain) must never start with a digit. + if (hostname) { + // LOW-TECH FIX FOR VALIDATOR-202 + // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203 + char[] chars = hostIP.toCharArray(); + int size = 1; + for (int i = 0; i < chars.length; i++) { + if (chars[i] == '.') { + size++; + } + } + String[] domainSegment = new String[size]; + int segCount = 0; + int segLen = 0; + Matcher atomMatcher = ATOM_PATTERN.matcher(hostIP); + + while (atomMatcher.find()) { + domainSegment[segCount] = atomMatcher.group(); + segLen = domainSegment[segCount].length() + 1; + hostIP = (segLen >= hostIP.length()) ? 
"" : hostIP.substring(segLen); + segCount++; + } + String topLevel = domainSegment[segCount - 1]; + // Only top-level domains of 2 to 4 characters are accepted; longer ones + // (for example "museum") are rejected by this check. + if (topLevel.length() < 2 || topLevel.length() > 4) { + return false; + } + + // First character of the top-level domain must be a letter + if (!ALPHA_PATTERN.matcher(topLevel.substring(0, 1)).matches()) { + return false; + } + + // Make sure there's a host name preceding the top-level domain. + if (segCount < 2) { + return false; + } + } + + if (!hostname && !ipV4Address) { + return false; + } + + String port = authorityMatcher.group(PARSE_AUTHORITY_PORT); + if (port != null) { + if (!PORT_PATTERN.matcher(port).matches()) { + return false; + } + } + + String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA); + return isBlankOrNull(extra); + } + + /** + * <p> + * Checks whether the value is <code>null</code> or contains nothing but + * whitespace. + * </p> + * + * @param value + * The value validation is being performed on. + * @return true if blank or null. + */ + private boolean isBlankOrNull(String value) { + return ((value == null) || (value.trim().length() == 0)); + } + + /** + * Returns true if the path is valid. A <code>null</code> value is considered + * invalid. + * + * @param path + * Path value to validate. + * @return true if path is valid. + */ + private boolean isValidPath(String path) { + if (path == null) { + return false; + } + + if (!PATH_PATTERN.matcher(path).matches()) { + return false; + } + + int slash2Count = countToken("//", path); + int slashCount = countToken("/", path); + int dot2Count = countToken("..", path); + + return (dot2Count <= 0) || ((slashCount - slash2Count - 1) > dot2Count); + } + + /** + * Returns true if the query is null or it's a properly formatted query + * string. + * + * @param query + * Query value to validate. + * @return true if query is valid. 
+ */ + private boolean isValidQuery(String query) { + if (query == null) { + return true; + } + + return QUERY_PATTERN.matcher(query).matches(); + } + + /** + * Returns the number of times the token appears in the target. + * + * @param token + * Token value to be counted. + * @param target + * Target value to count tokens in. + * @return the number of tokens. + */ + private int countToken(String token, String target) { + int tokenIndex = 0; + int count = 0; + while (tokenIndex != -1) { + tokenIndex = target.indexOf(token, tokenIndex); + if (tokenIndex > -1) { + tokenIndex++; + count++; + } + } + return count; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/package.html b/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/package.html new file mode 100644 index 0000000..b5ec8a1 --- /dev/null +++ b/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/package.html @@ -0,0 +1,9 @@ +<html> +<body> +<p>URL filter plugin that validates given urls.</p> +<p>This plugin runs a series of tests for the given url to make sure that given +url is valid and 'fetchable'.</p> +<p>Note: This plugin should <b>only</b> be used for web-related protocols such +as http, https and ftp.</p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java 
b/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java new file mode 100644 index 0000000..2e6d695 --- /dev/null +++ b/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.validator; + +import org.apache.nutch.urlfilter.validator.UrlValidator; +import org.junit.Assert; +import org.junit.Test; + +/** + * JUnit test case which tests 1. that valid urls are not filtered while invalid + * ones are filtered. 2. that Urls' scheme, authority, path and query are + * validated. + * + * @author tejasp + * + */ + +public class TestUrlValidator { + + /** + * Test method for + * {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)} + * . 
+ */ + @Test + public void testFilter() { + UrlValidator url_validator = new UrlValidator(); + Assert.assertNotNull(url_validator); + + Assert.assertNull("Filtering on a null object should return null", + url_validator.filter(null)); + Assert.assertNull("Invalid url: example.com/file[/].html", + url_validator.filter("example.com/file[/].html")); + Assert.assertNull("Invalid url: http://www.example.com/space here.html", + url_validator.filter("http://www.example.com/space here.html")); + Assert.assertNull("Invalid url: /main.html", + url_validator.filter("/main.html")); + Assert.assertNull("Invalid url: www.example.com/main.html", + url_validator.filter("www.example.com/main.html")); + Assert.assertNull("Invalid url: ftp:www.example.com/main.html", + url_validator.filter("ftp:www.example.com/main.html")); + Assert.assertNull( + "Inalid url: http://999.000.456.32/nutch/trunk/README.txt", + url_validator.filter("http://999.000.456.32/nutch/trunk/README.txt")); + Assert.assertNull("Invalid url: http://www.example.com/ma|in\\toc.html", + url_validator.filter(" http://www.example.com/ma|in\\toc.html")); + + Assert.assertNotNull( + "Valid url: https://issues.apache.org/jira/NUTCH-1127", + url_validator.filter("https://issues.apache.org/jira/NUTCH-1127")); + Assert + .assertNotNull( + "Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather", + url_validator + .filter("http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather")); + Assert + .assertNotNull( + "Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress", + url_validator + .filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress")); + Assert.assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf", + url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf")); + + } +} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlmeta/build.xml b/nutch-plugins/urlmeta/build.xml new file mode 100644 index 0000000..ed8d9c9 --- /dev/null +++ b/nutch-plugins/urlmeta/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlmeta" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlmeta/ivy.xml b/nutch-plugins/urlmeta/ivy.xml new file mode 100644 index 0000000..24d7606 --- /dev/null +++ b/nutch-plugins/urlmeta/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../../ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlmeta/plugin.xml b/nutch-plugins/urlmeta/plugin.xml new file mode 100644 index 0000000..c31adf6 --- /dev/null +++ b/nutch-plugins/urlmeta/plugin.xml @@ -0,0 +1,47 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlmeta" + name="URL Meta Indexing Filter" + version="1.0.0" + provider-name="sgonyea"> + + + <runtime> + <library name="urlmeta.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.indexer.urlmeta" + name="URL Meta Indexing Filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="indexer-urlmeta" + class="org.apache.nutch.indexer.urlmeta.URLMetaIndexingFilter"/> + </extension> + <extension id="org.apache.nutch.scoring.urlmeta" + name="URL Meta Scoring Filter" + point="org.apache.nutch.scoring.ScoringFilter"> + <implementation id="scoring-urlmeta" + class="org.apache.nutch.scoring.urlmeta.URLMetaScoringFilter" /> + </extension> +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlmeta/pom.xml b/nutch-plugins/urlmeta/pom.xml new file mode 100644 index 0000000..cba0b62 --- /dev/null +++ b/nutch-plugins/urlmeta/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlmeta</artifactId> + <packaging>jar</packaging> + + <name>urlmeta</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java new file mode 100644 index 0000000..dc673a2 --- /dev/null +++ b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java @@ -0,0 +1,118 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.indexer.urlmeta; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.Parse; + +/** + * This is part of the URL Meta plugin. It is designed to enhance the NUTCH-655 + * patch, by doing two things: 1. Meta Tags that are supplied with your Crawl + * URLs, during injection, will be propagated throughout the outlinks of those + * Crawl URLs. 2. When you index your URLs, the meta tags that you specified + * with your URLs will be indexed alongside those URLs--and can be directly + * queried, assuming you have done everything else correctly. + * + * The flat-file of URLs you are injecting should, per NUTCH-655, be + * tab-delimited in the form of: + * + * [www.url.com]\t[key1]=[value1]\t[key2]=[value2]...[keyN]=[valueN] + * + * Be aware that if you collide with keywords that are already in use (such as + * nutch.score/nutch.fetchInterval) then you are in for some unpredictable + * behavior. 
+ * + * Furthermore, in your nutch-site.xml config, you must specify that this plugin + * is to be used (1), as well as what (2) Meta Tags it should actively look for. + * This does not mean that you must use these tags for every URL, but it does + * mean that you must list _all_ of the meta tags that you have specified. If you + * want them to be propagated and indexed, that is. + * + * 1. As of Nutch 1.2, the property "plugin.includes" looks as follows: + * <value>protocol-http|urlfilter-regex|parse-(text|html|js|tika|rss)|index + * -(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic + * |scoring-opic|urlnormalizer-(pass|regex|basic)</value> You must change + * "index-(basic|anchor)" to "index-(basic|anchor|urlmeta)", in order to call + * this plugin. + * + * 2. You must also specify the property "urlmeta.tags", whose values are + * comma-delimited <value>key1, key2, key3</value> + * + * TODO: It may be ideal to offer two separate properties, to specify what gets + * indexed versus merely propagated. + * + */ +public class URLMetaIndexingFilter implements IndexingFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(URLMetaIndexingFilter.class); + private static final String CONF_PROPERTY = "urlmeta.tags"; + private static String[] urlMetaTags; + private Configuration conf; + + /** + * This takes the metatags that you have listed in your "urlmeta.tags" + * property, and looks for them inside the CrawlDatum object. If they exist, + * this will add them as attributes inside the NutchDocument. 
+ * + * @see IndexingFilter#filter + */ + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + if (conf != null) + this.setConf(conf); + + if (urlMetaTags == null || doc == null) + return doc; + + for (String metatag : urlMetaTags) { + Text metadata = (Text) datum.getMetaData().get(new Text(metatag)); + + if (metadata != null) + doc.add(metatag, metadata.toString()); + } + + return doc; + } + + /** Boilerplate */ + public Configuration getConf() { + return conf; + } + + /** + * handles conf assignment and pulls the value assignment from the + * "urlmeta.tags" property + */ + public void setConf(Configuration conf) { + this.conf = conf; + + if (conf == null) + return; + + urlMetaTags = conf.getStrings(CONF_PROPERTY); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/package.html b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/package.html new file mode 100644 index 0000000..5da5d56 --- /dev/null +++ b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/package.html @@ -0,0 +1,12 @@ +<html> + <body> + <p> + URL Meta Tag Indexing Plugin + </p> + <p> + Takes Meta Tags, injected alongside a URL (see NUTCH-655) and specified in the "urlmeta.tags" property, + and inserts them into the document--which is then sent to the Indexer. If you specify these fields in + the Nutch's schema (as well as the Indexer's), you can reasonably assume that they will be indexed. 
+ </p> + </body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java new file mode 100644 index 0000000..3965e42 --- /dev/null +++ b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java @@ -0,0 +1,175 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.scoring.urlmeta; + +import java.util.Collection; +import java.util.Map.Entry; +import java.util.Iterator; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.scoring.ScoringFilter; +import org.apache.nutch.scoring.ScoringFilterException; + +/** + * For documentation: + * + * @see URLMetaIndexingFilter + */ +public class URLMetaScoringFilter extends Configured implements ScoringFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(URLMetaScoringFilter.class); + private static final String CONF_PROPERTY = "urlmeta.tags"; + private static String[] urlMetaTags; + private Configuration conf; + + /** + * This will take the metatags that you have listed in your "urlmeta.tags" + * property, and looks for them inside the parseData object. If they exist, + * this will be propagated into your 'targets' Collection's ["outlinks"] + * attributes. 
+ * + * @see ScoringFilter#distributeScoreToOutlinks + */ + public CrawlDatum distributeScoreToOutlinks(Text fromUrl, + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount) throws ScoringFilterException { + if (urlMetaTags == null || targets == null || parseData == null) + return adjust; + + Iterator<Entry<Text, CrawlDatum>> targetIterator = targets.iterator(); + + while (targetIterator.hasNext()) { + Entry<Text, CrawlDatum> nextTarget = targetIterator.next(); + + for (String metatag : urlMetaTags) { + String metaFromParse = parseData.getMeta(metatag); + + if (metaFromParse == null) + continue; + + nextTarget.getValue().getMetaData() + .put(new Text(metatag), new Text(metaFromParse)); + } + } + return adjust; + } + + /** + * Takes the metadata, specified in your "urlmeta.tags" property, from the + * datum object and injects it into the content. This is transfered to the + * parseData object. + * + * @see ScoringFilter#passScoreBeforeParsing + * @see URLMetaScoringFilter#passScoreAfterParsing + */ + public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) { + if (urlMetaTags == null || content == null || datum == null) + return; + + for (String metatag : urlMetaTags) { + Text metaFromDatum = (Text) datum.getMetaData().get(new Text(metatag)); + + if (metaFromDatum == null) + continue; + + content.getMetadata().set(metatag, metaFromDatum.toString()); + } + } + + /** + * Takes the metadata, which was lumped inside the content, and replicates it + * within your parse data. 
+ * + * @see URLMetaScoringFilter#passScoreBeforeParsing + * @see ScoringFilter#passScoreAfterParsing + */ + public void passScoreAfterParsing(Text url, Content content, Parse parse) { + if (urlMetaTags == null || content == null || parse == null) + return; + + for (String metatag : urlMetaTags) { + String metaFromContent = content.getMetadata().get(metatag); + + if (metaFromContent == null) + continue; + + parse.getData().getParseMeta().set(metatag, metaFromContent); + } + } + + /** Boilerplate */ + public float generatorSortValue(Text url, CrawlDatum datum, float initSort) + throws ScoringFilterException { + return initSort; + } + + /** Boilerplate */ + public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, + CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) + throws ScoringFilterException { + return initScore; + } + + /** Boilerplate */ + public void initialScore(Text url, CrawlDatum datum) + throws ScoringFilterException { + return; + } + + /** Boilerplate */ + public void injectedScore(Text url, CrawlDatum datum) + throws ScoringFilterException { + return; + } + + /** Boilerplate */ + public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, + List<CrawlDatum> inlinked) throws ScoringFilterException { + return; + } + + /** + * handles conf assignment and pulls the value assignment from the + * "urlmeta.tags" property + */ + public void setConf(Configuration conf) { + super.setConf(conf); + + if (conf == null) + return; + + urlMetaTags = conf.getStrings(CONF_PROPERTY); + } + + /** Boilerplate */ + public Configuration getConf() { + return conf; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/package.html 
b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/package.html new file mode 100644 index 0000000..5bba7a8 --- /dev/null +++ b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/package.html @@ -0,0 +1,11 @@ +<html> + <body> + <p> + URL Meta Tag Scoring Plugin + </p> + <p> + Propagates Meta Tags, injected alongside a URL (see NUTCH-655) and specified in the "urlmeta.tags" property, + along to their outlinks. This does not actually perform scoring. + </p> + </body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-ajax/build.xml b/nutch-plugins/urlnormalizer-ajax/build.xml new file mode 100644 index 0000000..e100f8a --- /dev/null +++ b/nutch-plugins/urlnormalizer-ajax/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="urlnormalizer-ajax" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-ajax/ivy.xml b/nutch-plugins/urlnormalizer-ajax/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/urlnormalizer-ajax/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-ajax/plugin.xml b/nutch-plugins/urlnormalizer-ajax/plugin.xml new file mode 100644 index 0000000..ad8c72c --- /dev/null +++ b/nutch-plugins/urlnormalizer-ajax/plugin.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="urlnormalizer-ajax" + name="AJAX URL Normalizer" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlnormalizer-ajax.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlnormalizer.ajax" + name="Nutch AJAX URL Normalizer" + point="org.apache.nutch.net.URLNormalizer"> + <implementation id="AjaxURLNormalizer" + class="org.apache.nutch.net.urlnormalizer.ajax.AjaxURLNormalizer"/> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-ajax/pom.xml b/nutch-plugins/urlnormalizer-ajax/pom.xml new file mode 100644 index 0000000..e32d952 --- /dev/null +++ b/nutch-plugins/urlnormalizer-ajax/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlnormalizer-ajax</artifactId> + <packaging>jar</packaging> + + <name>urlnormalizer-ajax</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/src/main/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-ajax/src/main/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java b/nutch-plugins/urlnormalizer-ajax/src/main/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java new file mode 100644 index 0000000..5286f6f --- /dev/null +++ b/nutch-plugins/urlnormalizer-ajax/src/main/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java @@ -0,0 +1,236 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.ajax; + +import java.net.URL; +import java.net.URI; +import java.net.URLEncoder; +import java.net.URLDecoder; +import java.net.MalformedURLException; +import java.nio.charset.Charset; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.nutch.net.URLNormalizer; +import org.apache.nutch.net.URLNormalizers; +import org.apache.hadoop.conf.Configuration; + +/** + * URLNormalizer capable of dealing with AJAX URL's. + * + * Use the following regex filter to prevent escaped fragments from being fetched. + * ^(.*)\?.*_escaped_fragment_ + */ +public class AjaxURLNormalizer implements URLNormalizer { + public static final Logger LOG = LoggerFactory.getLogger(AjaxURLNormalizer.class); + + public static String AJAX_URL_PART = "#!"; + public static String ESCAPED_URL_PART = "_escaped_fragment_="; + + private Configuration conf; + private Charset utf8; + + /** + * Default constructor. + */ + public AjaxURLNormalizer() { + utf8 = Charset.forName("UTF-8"); + } + + /** + * Attempts to normalize the input URL string + * + * @param String urlString + * @return String + */ + public String normalize(String urlString, String scope) throws MalformedURLException { + LOG.info(scope + " // " + urlString); + + // When indexing, transform _escaped_fragment_ URL's to their #! counterpart + if (scope.equals(URLNormalizers.SCOPE_INDEXER) && urlString.contains(ESCAPED_URL_PART)) { + return normalizeEscapedFragment(urlString); + } + + // Otherwise transform #! 
URL's to their _escaped_fragment_ counterpart + if (urlString.contains(AJAX_URL_PART)) { + LOG.info(scope + " // " + normalizeHashedFragment(urlString)); + return normalizeHashedFragment(urlString); + } + + // Nothing to normalize here, return verbatim + return urlString; + } + + /** + * Returns a normalized input URL. #! querystrings are transformed + * to a _escaped_fragment_ form. + * + * @param String urlString + * @return String + */ + protected String normalizeHashedFragment(String urlString) throws MalformedURLException { + URL u = new URL(urlString); + int pos = urlString.indexOf(AJAX_URL_PART); + StringBuilder sb = new StringBuilder(urlString.substring(0, pos)); + + // Get the escaped fragment + String escapedFragment = escape(urlString.substring(pos + AJAX_URL_PART.length())); + + // Check if we already have a query in the URL + if (u.getQuery() == null) { + sb.append("?"); + } else { + sb.append("&"); + } + + // Append the escaped fragment key and the value + sb.append(ESCAPED_URL_PART); + sb.append(escapedFragment); + + return sb.toString(); + } + + /** + * Returns a normalized input URL. _escaped_fragment_ querystrings are + * transformed to a #! form. 
+ * + * @param String urlString + * @return String + */ + protected String normalizeEscapedFragment(String urlString) throws MalformedURLException { + int pos = urlString.indexOf(ESCAPED_URL_PART); + URL u = new URL(urlString); + StringBuilder sb = new StringBuilder(); + + // Write the URL without query string, we'll handle that later + sb.append(u.getProtocol()); + sb.append("://"); + sb.append(u.getHost()); + if (u.getPort() != -1) { + sb.append(":"); + sb.append(u.getPort()); + } + sb.append(u.getPath()); + + // Get the query string + String queryString = u.getQuery(); + + // Check if there's an & in the query string + int ampPos = queryString.indexOf("&"); + String keyValuePair = null; + + // If there's none, then the escaped fragment is the only k/v pair + if (ampPos == -1) { + keyValuePair = queryString; + queryString = ""; + } else { + // Obtain the escaped k/v pair + keyValuePair = queryString.substring(ampPos + 1); + + // Remove the escaped fragment key/value pair from the query string + queryString = queryString.replaceFirst("&" + keyValuePair, ""); + } + + // Remove escapedUrlPart from the keyValuePair + keyValuePair = keyValuePair.replaceFirst(ESCAPED_URL_PART, ""); + + // Get the fragment escaped + String unescapedFragment = unescape(keyValuePair); + + // Append a possible query string, without original escaped fragment + if (queryString.length() > 0) { + sb.append("?"); + sb.append(queryString); + } + + // Append the fragment delimiter and the unescaped fragment + sb.append("#!"); + sb.append(unescapedFragment); + + return sb.toString(); + } + + /** + * Unescape some exotic characters in the fragment part + * + * @param String fragmentPart + * @return String + */ + protected String unescape(String fragmentPart) { + try { + fragmentPart = URLDecoder.decode(fragmentPart, "UTF-8"); + } catch (Exception e) { + /// bluh + } + + return fragmentPart; + } + + /** + * Escape some exotic characters in the fragment part + * + * @param String fragmentPart + * 
@return String + */ + protected String escape(String fragmentPart) { + String hex = null; + StringBuilder sb = new StringBuilder(fragmentPart.length()); + + for (byte b : fragmentPart.getBytes(utf8)) { + if (b < 33) { + sb.append('%'); + + hex = Integer.toHexString(b & 0xFF).toUpperCase(); + + // Prevent odd # chars + if (hex.length() % 2 != 0) { + sb.append('0'); + } + sb.append(hex); + } else if (b == 35) { + sb.append("%23"); + } else if (b == 37) { + sb.append("%25"); + } else if (b == 38) { + sb.append("%26"); + } else if (b == 43) { + sb.append("%2B"); + } else { + sb.append((char)b); + } + } + + return sb.toString(); + } + + /** + * @param Configuration conf + */ + public void setConf(Configuration conf) { + this.conf = conf; + } + + /** + * @return Configuration + */ + public Configuration getConf() { + return this.conf; + } + +} \ No newline at end of file
