Author: markus Date: Mon Jan 11 17:10:30 2016 New Revision: 1724085 URL: http://svn.apache.org/viewvc?rev=1724085&view=rev Log: NUTCH-2190 Protocol normalizer
Added: nutch/trunk/src/plugin/urlnormalizer-protocol/ nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml nutch/trunk/src/plugin/urlnormalizer-protocol/data/ nutch/trunk/src/plugin/urlnormalizer-protocol/data/protocols.txt nutch/trunk/src/plugin/urlnormalizer-protocol/ivy.xml nutch/trunk/src/plugin/urlnormalizer-protocol/plugin.xml nutch/trunk/src/plugin/urlnormalizer-protocol/src/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/build.xml nutch/trunk/default.properties nutch/trunk/src/plugin/build.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724085&r1=1724084&r2=1724085&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Jan 11 17:10:30 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2190 Protocol normalizer (markus) + * NUTCH-1838 Host and domain based regex and automaton filtering (markus) * NUTCH-2178 DeduplicationJob to optionally group on host or domain (markus) Modified: nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1724085&r1=1724084&r2=1724085&view=diff ============================================================================== --- nutch/trunk/build.xml (original) +++ nutch/trunk/build.xml Mon Jan 11 17:10:30 2016 @@ -224,6 +224,7 @@ <packageset dir="${plugins.dir}/urlnormalizer-basic/src/java"/> <packageset dir="${plugins.dir}/urlnormalizer-host/src/java"/> <packageset dir="${plugins.dir}/urlnormalizer-pass/src/java"/> + <packageset dir="${plugins.dir}/urlnormalizer-protocol/src/java"/> <packageset dir="${plugins.dir}/urlnormalizer-querystring/src/java"/> <packageset dir="${plugins.dir}/urlnormalizer-regex/src/java"/> <packageset dir="${plugins.dir}/urlnormalizer-slash/src/java"/> @@ -660,6 +661,7 @@ <packageset dir="${plugins.dir}/urlnormalizer-basic/src/java"/> <packageset dir="${plugins.dir}/urlnormalizer-host/src/java"/> <packageset dir="${plugins.dir}/urlnormalizer-pass/src/java"/> + <packageset dir="${plugins.dir}/urlnormalizer-protocol/src/java"/> <packageset dir="${plugins.dir}/urlnormalizer-querystring/src/java"/> <packageset dir="${plugins.dir}/urlnormalizer-regex/src/java"/> <packageset dir="${plugins.dir}/urlnormalizer-slash/src/java"/> @@ -1082,6 +1084,8 @@ <source path="${plugins.dir}/urlnormalizer-host/src/test/" /> <source path="${plugins.dir}/urlnormalizer-pass/src/java/" /> <source path="${plugins.dir}/urlnormalizer-pass/src/test/" /> + <source path="${plugins.dir}/urlnormalizer-protocol/src/java/" /> + <source path="${plugins.dir}/urlnormalizer-protocol/src/test/" /> <source path="${plugins.dir}/urlnormalizer-querystring/src/java/" /> <source path="${plugins.dir}/urlnormalizer-querystring/src/test/" /> <source path="${plugins.dir}/urlnormalizer-regex/src/java/" /> Modified: nutch/trunk/default.properties URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1724085&r1=1724084&r2=1724085&view=diff ============================================================================== --- nutch/trunk/default.properties (original) +++ nutch/trunk/default.properties Mon Jan 11 17:10:30 2016 @@ -110,6 +110,7 @@ plugins.urlnormalizer=\ org.apache.nutch.net.urlnormalizer.basic*:\ org.apache.nutch.net.urlnormalizer.host*:\ org.apache.nutch.net.urlnormalizer.pass*:\ + org.apache.nutch.net.urlnormalizer.protocol*:\ org.apache.nutch.net.urlnormalizer.querystring*:\ org.apache.nutch.net.urlnormalizer.regex* Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1724085&r1=1724084&r2=1724085&view=diff ============================================================================== --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Mon Jan 11 17:10:30 2016 @@ -82,6 +82,7 @@ <ant dir="urlnormalizer-basic" target="deploy"/> <ant dir="urlnormalizer-host" target="deploy"/> <ant dir="urlnormalizer-pass" target="deploy"/> + <ant dir="urlnormalizer-protocol" target="deploy"/> <ant dir="urlnormalizer-querystring" target="deploy"/> <ant dir="urlnormalizer-regex" target="deploy"/> <ant dir="urlnormalizer-slash" target="deploy"/> @@ -125,6 +126,7 @@ <ant dir="urlnormalizer-basic" target="test"/> <ant dir="urlnormalizer-host" target="test"/> <ant dir="urlnormalizer-pass" target="test"/> + <ant dir="urlnormalizer-protocol" target="test"/> <ant dir="urlnormalizer-querystring" target="test"/> <ant dir="urlnormalizer-regex" target="test"/> <ant dir="urlnormalizer-slash" target="test"/> @@ -193,6 +195,7 @@ <ant dir="urlnormalizer-basic" target="clean"/> <ant dir="urlnormalizer-host" target="clean"/> <ant dir="urlnormalizer-pass" target="clean"/> + <ant dir="urlnormalizer-protocol" target="clean"/> <ant dir="urlnormalizer-querystring" target="clean"/> <ant dir="urlnormalizer-regex" target="clean"/> <ant dir="urlnormalizer-slash" target="clean"/> Added: nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml?rev=1724085&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml Mon Jan 11 17:10:30 2016 @@ -0,0 +1,27 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlnormalizer-protocol" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="data" /> + </copy> +</project> Added: nutch/trunk/src/plugin/urlnormalizer-protocol/data/protocols.txt URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-protocol/data/protocols.txt?rev=1724085&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-protocol/data/protocols.txt (added) +++ nutch/trunk/src/plugin/urlnormalizer-protocol/data/protocols.txt Mon Jan 11 17:10:30 2016 @@ -0,0 +1,7 @@ +# format: host\tprotocol\n + +example.org http +example.net http + +example.io https +example.nl https Added: nutch/trunk/src/plugin/urlnormalizer-protocol/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-protocol/ivy.xml?rev=1724085&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-protocol/ivy.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-protocol/ivy.xml Mon Jan 11 17:10:30 2016 @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="${nutch.root}/ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> Added: nutch/trunk/src/plugin/urlnormalizer-protocol/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-protocol/plugin.xml?rev=1724085&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-protocol/plugin.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-protocol/plugin.xml Mon Jan 11 17:10:30 2016 @@ -0,0 +1,43 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlnormalizer-protocol" + name="Protocol URL Normalizer" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlnormalizer-protocol.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlnormalizer.protocol" + name="Nutch Protocol URL Normalizer" + point="org.apache.nutch.net.URLNormalizer"> + <implementation id="ProtocolURLNormalizer" + class="org.apache.nutch.net.urlnormalizer.protocol.ProtocolURLNormalizer"> + <parameter name="file" value="protocols.txt"/> + </implementation> + </extension> + +</plugin> Added: nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java?rev=1724085&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java (added) +++ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java Mon Jan 11 17:10:30 2016 @@ -0,0 +1,190 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.protocol; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.URLNormalizer; +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.util.URLUtil; + +/** + * @author mar...@openindex.io + */ +public class ProtocolURLNormalizer implements URLNormalizer { + + private Configuration conf; + + private static final Logger LOG = LoggerFactory.getLogger(ProtocolURLNormalizer.class); + + private static final char QUESTION_MARK = '?'; + private static final String PROTOCOL_DELIMITER = "://"; + + private static String attributeFile = null; + private String protocolsFile = null; + + // We record a map of hosts and boolean, the boolean denotes whether the host should + // have slashes after URL paths. True means slash, false means remove the slash + private static final Map<String,String> protocolsMap = new HashMap<String,String>(); + + public ProtocolURLNormalizer() {} + + public ProtocolURLNormalizer(String protocolsFile) { + this.protocolsFile = protocolsFile; + } + + private synchronized void readConfiguration(Reader configReader) throws IOException { + if (protocolsMap.size() > 0) { + return; + } + + BufferedReader reader = new BufferedReader(configReader); + String line, host; + String protocol; + int delimiterIndex; + + while ((line = reader.readLine()) != null) { + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { + line.trim(); + delimiterIndex = line.indexOf(" "); + // try tabulator + if (delimiterIndex == -1) { + delimiterIndex = line.indexOf("\t"); + } + + host = line.substring(0, delimiterIndex); + protocol = line.substring(delimiterIndex + 1).trim(); + + protocolsMap.put(host, protocol); + } + } + } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + + // get the extensions for domain urlfilter + String pluginName = "urlnormalizer-protocol"; + Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( + URLNormalizer.class.getName()).getExtensions(); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + if (extension.getDescriptor().getPluginId().equals(pluginName)) { + attributeFile = extension.getAttribute("file"); + break; + } + } + + // handle blank non empty input + if (attributeFile != null && attributeFile.trim().equals("")) { + attributeFile = null; + } + + if (attributeFile != null) { + if (LOG.isInfoEnabled()) { + LOG.info("Attribute \"file\" is defined for plugin " + pluginName + + " as " + attributeFile); + } + } + else { + if (LOG.isWarnEnabled()) { + LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " + + pluginName); + } + } + + // domain file and attribute "file" take precedence if defined + String file = conf.get("urlnormalizer.protocols.file"); + String stringRules = conf.get("urlnormalizer.protocols.rules"); + if (protocolsFile != null) { + file = protocolsFile; + } + else if (attributeFile != null) { + file = attributeFile; + } + Reader reader = null; + if (stringRules != null) { // takes precedence over files + reader = new StringReader(stringRules); + } else { + reader = conf.getConfResourceAsReader(file); + } + try { + if (reader == null) { + reader = new FileReader(file); + } + readConfiguration(reader); + } + catch (IOException e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } + + public String normalize(String url, String scope) throws MalformedURLException { + return normalize(url, null, scope); + } + + public String normalize(String url, CrawlDatum crawlDatum, String scope) throws MalformedURLException { + // Get URL repr. + URL u = new URL(url); + + // Get the host + String host = u.getHost(); + + // Do we have a rule for this host? + if (protocolsMap.containsKey(host)) { + String protocol = u.getProtocol(); + String requiredProtocol = protocolsMap.get(host); + + // Incorrect protocol? + if (!protocol.equals(requiredProtocol)) { + // Rebuild URL with new protocol + StringBuilder buffer = new StringBuilder(requiredProtocol); + buffer.append(PROTOCOL_DELIMITER); + buffer.append(host); + buffer.append(u.getPath()); + + String queryString = u.getQuery(); + if (queryString != null) { + buffer.append(QUESTION_MARK); + buffer.append(queryString); + } + + url = buffer.toString(); + } + } + + return url; + } +} \ No newline at end of file Added: nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java?rev=1724085&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java (added) +++ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java Mon Jan 11 17:10:30 2016 @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.protocol; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestProtocolURLNormalizer extends TestCase { + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); + + public void testProtocolURLNormalizer() throws Exception { + Configuration conf = NutchConfiguration.create(); + + String protocolsFile = SAMPLES + SEPARATOR + "protocols.txt"; + ProtocolURLNormalizer normalizer = new ProtocolURLNormalizer(protocolsFile); + normalizer.setConf(conf); + + // No change + assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT)); + + // https to http + assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT)); + + // no change + assertEquals("https://example.io/", normalizer.normalize("https://example.io/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("https://example.nl/", normalizer.normalize("https://example.nl/", URLNormalizers.SCOPE_DEFAULT)); + + // http to https + assertEquals("https://example.io/", normalizer.normalize("http://example.io/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("https://example.nl/", normalizer.normalize("http://example.nl/", URLNormalizers.SCOPE_DEFAULT)); + } +}