Author: markus Date: Tue Jun 12 10:33:18 2012 New Revision: 1349236 URL: http://svn.apache.org/viewvc?rev=1349236&view=rev Log: NUTCH-1319 HostNormalizer plugin
Added: nutch/trunk/conf/host-urlnormalizer.txt nutch/trunk/src/plugin/urlnormalizer-host/ nutch/trunk/src/plugin/urlnormalizer-host/build.xml nutch/trunk/src/plugin/urlnormalizer-host/data/ nutch/trunk/src/plugin/urlnormalizer-host/data/hosts.txt nutch/trunk/src/plugin/urlnormalizer-host/ivy.xml nutch/trunk/src/plugin/urlnormalizer-host/plugin.xml nutch/trunk/src/plugin/urlnormalizer-host/src/ nutch/trunk/src/plugin/urlnormalizer-host/src/java/ nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/ nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/ nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/ nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-host/src/test/ nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/ nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/ nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/ nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java Modified: nutch/trunk/CHANGES.txt Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1349236&r1=1349235&r2=1349236&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Jun 12 10:33:18 2012 @@ -2,6 +2,8 @@ Nutch Change Log (trunk) 
Current Development: +* NUTCH-1319 HostNormalizer plugin (markus) + * NUTCH-1386 Headings filter not to add empty values (markus) * NUTCH-1356 ParseUtil use ExecutorService instead of manually thread handling (ferdy via markus) Added: nutch/trunk/conf/host-urlnormalizer.txt URL: http://svn.apache.org/viewvc/nutch/trunk/conf/host-urlnormalizer.txt?rev=1349236&view=auto ============================================================================== --- nutch/trunk/conf/host-urlnormalizer.txt (added) +++ nutch/trunk/conf/host-urlnormalizer.txt Tue Jun 12 10:33:18 2012 @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# New line separated list of hosts mapped to their desired targets. +# wildcard hosts are supported. 
Format: host target + +# Map www.apache.org to apache.org +www.apache.org apache.org + +# Map all example.org subdomains to example.org +*.example.org example.org Added: nutch/trunk/src/plugin/urlnormalizer-host/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/build.xml?rev=1349236&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-host/build.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-host/build.xml Tue Jun 12 10:33:18 2012 @@ -0,0 +1,27 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="urlnormalizer-host" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="data" /> + </copy> +</project> Added: nutch/trunk/src/plugin/urlnormalizer-host/data/hosts.txt URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/data/hosts.txt?rev=1349236&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-host/data/hosts.txt (added) +++ nutch/trunk/src/plugin/urlnormalizer-host/data/hosts.txt Tue Jun 12 10:33:18 2012 @@ -0,0 +1,8 @@ +# Map all sub domains to the bare domain +*.example.com example.com + +# Strip the www. sub domain from www. URLs +www.example.net example.net + +# Force www. sub domain when hitting link without sub domain +example.org www.example.org \ No newline at end of file Added: nutch/trunk/src/plugin/urlnormalizer-host/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/ivy.xml?rev=1349236&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-host/ivy.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-host/ivy.xml Tue Jun 12 10:33:18 2012 @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="${nutch.root}/ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> Added: nutch/trunk/src/plugin/urlnormalizer-host/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/plugin.xml?rev=1349236&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-host/plugin.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-host/plugin.xml Tue Jun 12 10:33:18 2012 @@ -0,0 +1,43 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlnormalizer-host" + name="Host URL Normalizer" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlnormalizer-host.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlnormalizer.host" + name="Nutch Host URL Normalizer" + point="org.apache.nutch.net.URLNormalizer"> + <implementation id="HostURLNormalizer" + class="org.apache.nutch.net.urlnormalizer.host.HostURLNormalizer"> + <parameter name="file" value="host-urlnormalizer.txt"/> + </implementation> + </extension> + +</plugin> Added: nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java?rev=1349236&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java (added) +++ nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java Tue Jun 12 10:33:18 2012 @@ -0,0 +1,198 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.host; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashMap; + +import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizer; +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.util.URLUtil; + +/** + * URL normalizer for mapping hosts to their desired form. It takes + * a simple text file as source in the format: + * + * example.org www.example.org + * + * mapping all URLs of example.org to the www sub-domain. 
It also + * allows for wildcards to be used to map all sub-domains to another + * host: + * + * *.example.org www.example.org + * + * The parsed mappings are kept in a static map shared by all instances; + * rules are only loaded while that map is still empty. + */ +public class HostURLNormalizer implements URLNormalizer { + + private Configuration conf; + + private static final Logger LOG = LoggerFactory.getLogger(HostURLNormalizer.class); + + private static String attributeFile = null; + private String hostsFile = null; + private static final HashMap<String,String> hostsMap = new HashMap<String,String>(); + + public HostURLNormalizer() {} + + /** + * @param hostsFile name of the rules file to load in setConf + */ + public HostURLNormalizer(String hostsFile) { + this.hostsFile = hostsFile; + } + + /** + * Parses "host target" lines into the shared hosts map. Blank lines, lines + * starting with # and lines without a target are skipped. Does nothing when + * mappings have already been loaded. Static so that the lock is class-wide, + * matching the static map it guards. + * + * @param configReader source of the mapping rules + * @throws IOException if reading from configReader fails + */ + private static synchronized void readConfiguration(Reader configReader) throws IOException { + if (hostsMap.size() > 0) { + return; + } + + BufferedReader reader = new BufferedReader(configReader); + String line, host, target; + int delimiterIndex; + + while ((line = reader.readLine()) != null) { + /* String.trim() returns a new string, so the result must be kept */ + line = line.trim(); + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { + delimiterIndex = line.indexOf(" "); + + /* a line without a delimiter has no target and cannot be mapped */ + if (delimiterIndex < 0) { + LOG.warn("Ignoring host mapping without target: " + line); + continue; + } + + host = line.substring(0, delimiterIndex); + /* tolerate more than one space between host and target */ + target = line.substring(delimiterIndex + 1).trim(); + hostsMap.put(host, target); + } + } + } + + public Configuration getConf() { + return conf; + } + + /** + * Stores the configuration and loads the host mappings. Rules are taken from + * the urlnormalizer.hosts.rules property when it is set; otherwise from the + * file named by, in order of precedence, the constructor argument, the + * plugin.xml "file" attribute, or the urlnormalizer.hosts.file property. + * I/O errors are logged, not thrown. + */ + public void setConf(Configuration conf) { + this.conf = conf; + + /* get the extensions for the host urlnormalizer */ + String pluginName = "urlnormalizer-host"; + Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( + URLNormalizer.class.getName()).getExtensions(); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + if (extension.getDescriptor().getPluginId().equals(pluginName)) { + attributeFile = extension.getAttribute("file"); + break; + } + } + + /* handle blank non empty input */ + if (attributeFile != null && attributeFile.trim().equals("")) { + attributeFile = null; + } + + if (attributeFile != null) { + if (LOG.isInfoEnabled()) { + LOG.info("Attribute \"file\" is defined for plugin " + pluginName + + " as " + attributeFile); + } + }
+ else { + if (LOG.isWarnEnabled()) { + LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " + + pluginName); + } + } + + /* hosts file and attribute "file" take precedence if defined */ + String file = conf.get("urlnormalizer.hosts.file"); + String stringRules = conf.get("urlnormalizer.hosts.rules"); + if (hostsFile != null) { + file = hostsFile; + } + else if (attributeFile != null) { + file = attributeFile; + } + + /* nothing to load from; avoid handing a null name to the resource lookup */ + if (stringRules == null && file == null) { + LOG.warn("No host rules or hosts file configured for plugin " + pluginName); + return; + } + + Reader reader = null; + if (stringRules != null) { /* takes precedence over files */ + reader = new StringReader(stringRules); + } else { + reader = conf.getConfResourceAsReader(file); + } + try { + if (reader == null) { + reader = new FileReader(file); + } + readConfiguration(reader); + } + catch (IOException e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + finally { + /* always release the underlying file or resource stream */ + if (reader != null) { + try { + reader.close(); + } + catch (IOException e) { + LOG.warn(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } + } + } + + /** + * Rewrites the host of urlString according to the loaded mappings. An exact + * host match is tried first, then wildcard entries from the shortest domain + * suffix to the longest. + * + * @param urlString URL to normalize + * @param scope not used by this normalizer + * @return the URL with its host replaced, or urlString unchanged if no mapping matches + * @throws MalformedURLException if urlString cannot be parsed as a URL + */ + public String normalize(String urlString, String scope) throws MalformedURLException { + String host = new URL(urlString).getHost(); + + /* Test static hosts */ + if (hostsMap.containsKey(host)) { + return replaceHost(urlString, host, hostsMap.get(host)); + } + + /* Test for wildcard in reverse order */ + String[] hostParts = host.split("\\."); + + /* A host made up of dots only splits into nothing; there is no label to match */ + if (hostParts.length == 0) { + return urlString; + } + + /* Use a buffer for our host parts */ + StringBuilder hostBuffer = new StringBuilder(); + + /* Add the tld to the buffer */ + hostBuffer.append(hostParts[hostParts.length -1]); + + /* Stop before index 0 so the wildcard always stands for at least one label */ + for (int i = hostParts.length - 2; i > 0; i--) { + /* Prepend another sub domain */ + hostBuffer.insert(0, hostParts[i] + "."); + + /* Make a wildcarded sub domain */ + String wildCardHost = "*." + hostBuffer.toString(); + + /* Check if this wildcard sub domain exists */ + if (hostsMap.containsKey(wildCardHost)) { + /* Replace the original input host with the wildcard target */ + return replaceHost(urlString, host, hostsMap.get(wildCardHost)); + } + } + + return urlString; + } + + /** + * Replaces the first occurrence of host that follows the scheme separator in + * urlString with target. + * + * @return the rewritten URL, or urlString unchanged if host does not occur in it + */ + protected String replaceHost(String urlString, String host, String target) { + /* Search after "://" so a host equal to the scheme name is not matched inside the scheme */ + int schemeEnd = urlString.indexOf("://"); + int hostIndex = urlString.indexOf(host, schemeEnd < 0 ? 0 : schemeEnd + 3); + + if (hostIndex < 0) { + return urlString; + } + + StringBuilder buffer = new StringBuilder(); + + buffer.append(urlString.substring(0, hostIndex)); + buffer.append(target); + buffer.append(urlString.substring(hostIndex + host.length())); + + return buffer.toString(); + } + +} Added: nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java?rev=1349236&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java (added) +++ nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java Tue Jun 12 10:33:18 2012 @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.host; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +/** + * Checks HostURLNormalizer against the rules in the hosts.txt sample file, + * which is looked up in the directory named by the test.data system property + * (current directory when the property is not set). + */ +public class TestHostURLNormalizer extends TestCase { + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); + + /** + * Normalizes one URL per kind of rule and compares the result with the + * expected rewritten URL. + */ + public void testHostURLNormalizer() throws Exception { + Configuration conf = NutchConfiguration.create(); + + String hostsFile = SAMPLES + SEPARATOR + "hosts.txt"; + HostURLNormalizer normalizer = new HostURLNormalizer(hostsFile); + normalizer.setConf(conf); + + // Force www. sub domain when hitting link without sub domain + assertEquals("http://www.example.org/page.html", normalizer.normalize("http://example.org/page.html", URLNormalizers.SCOPE_DEFAULT)); + + // Strip the www. sub domain from www. URLs + assertEquals("http://example.net/path/to/something.html", normalizer.normalize("http://www.example.net/path/to/something.html", URLNormalizers.SCOPE_DEFAULT)); + + // Wildcard rule: every sub domain maps to the bare host, the bare host itself stays unchanged 
+ assertEquals("http://example.com/?does=it&still=work", normalizer.normalize("http://example.com/?does=it&still=work", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.com/buh", normalizer.normalize("http://http.www.example.com/buh", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.com/blaat", normalizer.normalize("http://whatever.example.com/blaat", URLNormalizers.SCOPE_DEFAULT)); + } +}