Author: markus Date: Thu Feb 12 08:30:31 2015 New Revision: 1659167 URL: http://svn.apache.org/r1659167 Log: NUTCH-1323 AjaxNormalizer
Added: nutch/trunk/src/plugin/urlnormalizer-ajax/ nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml nutch/trunk/src/plugin/urlnormalizer-ajax/src/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/build.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659167&r1=1659166&r2=1659167&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Feb 12 08:30:31 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1323 AjaxNormalizer (markus) + * NUTCH-1918 TikaParser specifies a default 
namespace when generating DOM (jnioche) * NUTCH-1889 Store all values from Tika metadata in Nutch metadata (jnioche) Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1659167&r1=1659166&r2=1659167&view=diff ============================================================================== --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Thu Feb 12 08:30:31 2015 @@ -69,6 +69,7 @@ <ant dir="urlfilter-suffix" target="deploy"/> <ant dir="urlfilter-validator" target="deploy"/> <ant dir="urlmeta" target="deploy"/> + <ant dir="urlnormalizer-ajax" target="deploy"/> <ant dir="urlnormalizer-basic" target="deploy"/> <ant dir="urlnormalizer-host" target="deploy"/> <ant dir="urlnormalizer-pass" target="deploy"/> @@ -107,6 +108,7 @@ <ant dir="urlfilter-regex" target="test"/> <ant dir="urlfilter-suffix" target="test"/> <ant dir="urlfilter-validator" target="test"/> + <ant dir="urlnormalizer-ajax" target="test"/> <ant dir="urlnormalizer-basic" target="test"/> <ant dir="urlnormalizer-host" target="test"/> <ant dir="urlnormalizer-pass" target="test"/> @@ -164,8 +166,9 @@ <ant dir="urlfilter-suffix" target="clean"/> <ant dir="urlfilter-validator" target="clean"/> <ant dir="urlmeta" target="clean"/> - <ant dir="urlnormalizer-host" target="clean"/> + <ant dir="urlnormalizer-ajax" target="clean"/> <ant dir="urlnormalizer-basic" target="clean"/> + <ant dir="urlnormalizer-host" target="clean"/> <ant dir="urlnormalizer-pass" target="clean"/> <ant dir="urlnormalizer-querystring" target="clean"/> <ant dir="urlnormalizer-regex" target="clean"/> Added: nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml?rev=1659167&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml (added) +++ 
nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml Thu Feb 12 08:30:31 2015 @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlnormalizer-ajax" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml?rev=1659167&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml Thu Feb 12 08:30:31 2015 @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> Added: nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml?rev=1659167&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml Thu Feb 12 08:30:31 2015 @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlnormalizer-ajax" + name="AJAX URL Normalizer" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlnormalizer-ajax.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlnormalizer.ajax" + name="Nutch AJAX URL Normalizer" + point="org.apache.nutch.net.URLNormalizer"> + <implementation id="AjaxURLNormalizer" + class="org.apache.nutch.net.urlnormalizer.ajax.AjaxURLNormalizer"/> + </extension> + +</plugin> Added: nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java?rev=1659167&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java (added) +++ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java Thu Feb 12 08:30:31 2015 @@ -0,0 +1,236 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.ajax; + +import java.net.URL; +import java.net.URI; +import java.net.URLEncoder; +import java.net.URLDecoder; +import java.net.MalformedURLException; +import java.nio.charset.Charset; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.nutch.net.URLNormalizer; +import org.apache.nutch.net.URLNormalizers; +import org.apache.hadoop.conf.Configuration; + +/** + * URLNormalizer capable of dealing with AJAX URL's. + * + * Use the following regex filter to prevent escaped fragments from being fetched. + * ^(.*)\?.*_escaped_fragment_ + */ +public class AjaxURLNormalizer implements URLNormalizer { + public static final Logger LOG = LoggerFactory.getLogger(AjaxURLNormalizer.class); + + public static String AJAX_URL_PART = "#!"; + public static String ESCAPED_URL_PART = "_escaped_fragment_="; + + private Configuration conf; + private Charset utf8; + + /** + * Default constructor. + */ + public AjaxURLNormalizer() { + utf8 = Charset.forName("UTF-8"); + } + + /** + * Attempts to normalize the input URL string + * + * @param String urlString + * @return String + */ + public String normalize(String urlString, String scope) throws MalformedURLException { + LOG.info(scope + " // " + urlString); + + // When indexing, transform _escaped_fragment_ URL's to their #! 
counterpart + if (scope.equals(URLNormalizers.SCOPE_INDEXER) && urlString.contains(ESCAPED_URL_PART)) { + return normalizeEscapedFragment(urlString); + } + + // Otherwise transform #! URL's to their _escaped_fragment_ counterpart + if (urlString.contains(AJAX_URL_PART)) { + LOG.info(scope + " // " + normalizeHashedFragment(urlString)); + return normalizeHashedFragment(urlString); + } + + // Nothing to normalize here, return verbatim + return urlString; + } + + /** + * Returns a normalized input URL. #! querystrings are transformed + * to a _escaped_fragment_ form. + * + * @param String urlString + * @return String + */ + protected String normalizeHashedFragment(String urlString) throws MalformedURLException { + URL u = new URL(urlString); + int pos = urlString.indexOf(AJAX_URL_PART); + StringBuilder sb = new StringBuilder(urlString.substring(0, pos)); + + // Get the escaped fragment + String escapedFragment = escape(urlString.substring(pos + AJAX_URL_PART.length())); + + // Check if we already have a query in the URL + if (u.getQuery() == null) { + sb.append("?"); + } else { + sb.append("&"); + } + + // Append the escaped fragment key and the value + sb.append(ESCAPED_URL_PART); + sb.append(escapedFragment); + + return sb.toString(); + } + + /** + * Returns a normalized input URL. _escaped_fragment_ querystrings are + * transformed to a #! form. 
+ * + * @param String urlString + * @return String + */ + protected String normalizeEscapedFragment(String urlString) throws MalformedURLException { + int pos = urlString.indexOf(ESCAPED_URL_PART); + URL u = new URL(urlString); + StringBuilder sb = new StringBuilder(); + + // Write the URL without query string, we'll handle that later + sb.append(u.getProtocol()); + sb.append("://"); + sb.append(u.getHost()); + if (u.getPort() != -1) { + sb.append(":"); + sb.append(u.getPort()); + } + sb.append(u.getPath()); + + // Get the query string + String queryString = u.getQuery(); + + // Check if there's an & in the query string + int ampPos = queryString.indexOf("&"); + String keyValuePair = null; + + // If there's none, then the escaped fragment is the only k/v pair + if (ampPos == -1) { + keyValuePair = queryString; + queryString = ""; + } else { + // Obtain the escaped k/v pair + keyValuePair = queryString.substring(ampPos + 1); + + // Remove the escaped fragment key/value pair from the query string + queryString = queryString.replaceFirst("&" + keyValuePair, ""); + } + + // Remove escapedUrlPart from the keyValuePair + keyValuePair = keyValuePair.replaceFirst(ESCAPED_URL_PART, ""); + + // Get the fragment escaped + String unescapedFragment = unescape(keyValuePair); + + // Append a possible query string, without original escaped fragment + if (queryString.length() > 0) { + sb.append("?"); + sb.append(queryString); + } + + // Append the fragment delimiter and the unescaped fragment + sb.append("#!"); + sb.append(unescapedFragment); + + return sb.toString(); + } + + /** + * Unescape some exotic characters in the fragment part + * + * @param String fragmentPart + * @return String + */ + protected String unescape(String fragmentPart) { + try { + fragmentPart = URLDecoder.decode(fragmentPart, "UTF-8"); + } catch (Exception e) { + /// bluh + } + + return fragmentPart; + } + + /** + * Escape some exotic characters in the fragment part + * + * @param String fragmentPart + * 
@return String + */ + protected String escape(String fragmentPart) { + String hex = null; + StringBuilder sb = new StringBuilder(fragmentPart.length()); + + for (byte b : fragmentPart.getBytes(utf8)) { + if (b < 33) { + sb.append('%'); + + hex = Integer.toHexString(b & 0xFF).toUpperCase(); + + // Prevent odd # chars + if (hex.length() % 2 != 0) { + sb.append('0'); + } + sb.append(hex); + } else if (b == 35) { + sb.append("%23"); + } else if (b == 37) { + sb.append("%25"); + } else if (b == 38) { + sb.append("%26"); + } else if (b == 43) { + sb.append("%2B"); + } else { + sb.append((char)b); + } + } + + return sb.toString(); + } + + /** + * @param Configuration conf + */ + public void setConf(Configuration conf) { + this.conf = conf; + } + + /** + * @return Configuration + */ + public Configuration getConf() { + return this.conf; + } + +} \ No newline at end of file Added: nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java?rev=1659167&view=auto ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java (added) +++ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java Thu Feb 12 08:30:31 2015 @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.ajax; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +/** Unit tests for AjaxURLNormalizer. */ +public class TestAjaxURLNormalizer extends TestCase { + private AjaxURLNormalizer normalizer; + private Configuration conf; + + public TestAjaxURLNormalizer(String name) { + super(name); + normalizer = new AjaxURLNormalizer(); + conf = NutchConfiguration.create(); + normalizer.setConf(conf); + } + + public void testNormalizer() throws Exception { + // check if AJAX URLs are normalized to an _escaped_fragment_ form + normalizeTest("http://example.org/#!k=v", "http://example.org/?_escaped_fragment_=k=v"); + + // Check with some escaped chars + normalizeTest("http://example.org/#!k=v&something=is wrong", "http://example.org/?_escaped_fragment_=k=v%26something=is%20wrong"); + + // Check with query string and multiple fragment params + normalizeTest("http://example.org/path.html?queryparam=queryvalue#!key1=value1&key2=value2", "http://example.org/path.html?queryparam=queryvalue&_escaped_fragment_=key1=value1%26key2=value2"); + } + + public void testNormalizerWhenIndexing() throws Exception { + // check if it works the other way around + normalizeTest("http://example.org/?_escaped_fragment_=key=value", "http://example.org/#!key=value", URLNormalizers.SCOPE_INDEXER); + normalizeTest("http://example.org/?key=value&_escaped_fragment_=key=value", 
"http://example.org/?key=value#!key=value", URLNormalizers.SCOPE_INDEXER); + normalizeTest("http://example.org/page.html?key=value&_escaped_fragment_=key=value%26something=is%20wrong", "http://example.org/page.html?key=value#!key=value&something=is wrong", URLNormalizers.SCOPE_INDEXER); + } + + private void normalizeTest(String weird, String normal) throws Exception { + assertEquals(normal, normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT)); + } + + private void normalizeTest(String weird, String normal, String scope) throws Exception { + assertEquals(normal, normalizer.normalize(weird, scope)); + } + + public static void main(String[] args) throws Exception { + new TestAjaxURLNormalizer("test").testNormalizer(); + } +} \ No newline at end of file