Author: markus
Date: Thu Feb 12 08:30:31 2015
New Revision: 1659167
URL: http://svn.apache.org/r1659167
Log:
NUTCH-1323 AjaxNormalizer
Added:
nutch/trunk/src/plugin/urlnormalizer-ajax/
nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml
nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml
nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml
nutch/trunk/src/plugin/urlnormalizer-ajax/src/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/build.xml
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659167&r1=1659166&r2=1659167&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Feb 12 08:30:31 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1323 AjaxNormalizer (markus)
+
* NUTCH-1918 TikaParser specifies a default namespace when generating DOM
(jnioche)
* NUTCH-1889 Store all values from Tika metadata in Nutch metadata (jnioche)
Modified: nutch/trunk/src/plugin/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1659167&r1=1659166&r2=1659167&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Thu Feb 12 08:30:31 2015
@@ -69,6 +69,7 @@
<ant dir="urlfilter-suffix" target="deploy"/>
<ant dir="urlfilter-validator" target="deploy"/>
<ant dir="urlmeta" target="deploy"/>
+ <ant dir="urlnormalizer-ajax" target="deploy"/>
<ant dir="urlnormalizer-basic" target="deploy"/>
<ant dir="urlnormalizer-host" target="deploy"/>
<ant dir="urlnormalizer-pass" target="deploy"/>
@@ -107,6 +108,7 @@
<ant dir="urlfilter-regex" target="test"/>
<ant dir="urlfilter-suffix" target="test"/>
<ant dir="urlfilter-validator" target="test"/>
+ <ant dir="urlnormalizer-ajax" target="test"/>
<ant dir="urlnormalizer-basic" target="test"/>
<ant dir="urlnormalizer-host" target="test"/>
<ant dir="urlnormalizer-pass" target="test"/>
@@ -164,8 +166,9 @@
<ant dir="urlfilter-suffix" target="clean"/>
<ant dir="urlfilter-validator" target="clean"/>
<ant dir="urlmeta" target="clean"/>
- <ant dir="urlnormalizer-host" target="clean"/>
+ <ant dir="urlnormalizer-ajax" target="clean"/>
<ant dir="urlnormalizer-basic" target="clean"/>
+ <ant dir="urlnormalizer-host" target="clean"/>
<ant dir="urlnormalizer-pass" target="clean"/>
<ant dir="urlnormalizer-querystring" target="clean"/>
<ant dir="urlnormalizer-regex" target="clean"/>
Added: nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml?rev=1659167&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml Thu Feb 12 08:30:31 2015
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-ajax" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
Added: nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml?rev=1659167&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml Thu Feb 12 08:30:31 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
Added: nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml?rev=1659167&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml Thu Feb 12 08:30:31
2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlnormalizer-ajax"
+ name="AJAX URL Normalizer"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlnormalizer-ajax.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlnormalizer.ajax"
+ name="Nutch AJAX URL Normalizer"
+ point="org.apache.nutch.net.URLNormalizer">
+ <implementation id="AjaxURLNormalizer"
+ class="org.apache.nutch.net.urlnormalizer.ajax.AjaxURLNormalizer"/>
+ </extension>
+
+</plugin>
Added:
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java?rev=1659167&view=auto
==============================================================================
---
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
(added)
+++
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
Thu Feb 12 08:30:31 2015
@@ -0,0 +1,236 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.ajax;
+
+import java.net.URL;
+import java.net.URI;
+import java.net.URLEncoder;
+import java.net.URLDecoder;
+import java.net.MalformedURLException;
+import java.nio.charset.Charset;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * URLNormalizer capable of dealing with AJAX URL's.
+ *
+ * Use the following regex filter to prevent escaped fragments from being
fetched.
+ * ^(.*)\?.*_escaped_fragment_
+ */
+public class AjaxURLNormalizer implements URLNormalizer {
+ public static final Logger LOG =
LoggerFactory.getLogger(AjaxURLNormalizer.class);
+
+ public static String AJAX_URL_PART = "#!";
+ public static String ESCAPED_URL_PART = "_escaped_fragment_=";
+
+ private Configuration conf;
+ private Charset utf8;
+
+ /**
+ * Default constructor.
+ */
+ public AjaxURLNormalizer() {
+ utf8 = Charset.forName("UTF-8");
+ }
+
+ /**
+ * Attempts to normalize the input URL string
+ *
+ * @param String urlString
+ * @return String
+ */
+ public String normalize(String urlString, String scope) throws
MalformedURLException {
+ LOG.info(scope + " // " + urlString);
+
+ // When indexing, transform _escaped_fragment_ URL's to their #!
counterpart
+ if (scope.equals(URLNormalizers.SCOPE_INDEXER) &&
urlString.contains(ESCAPED_URL_PART)) {
+ return normalizeEscapedFragment(urlString);
+ }
+
+ // Otherwise transform #! URL's to their _escaped_fragment_ counterpart
+ if (urlString.contains(AJAX_URL_PART)) {
+ LOG.info(scope + " // " + normalizeHashedFragment(urlString));
+ return normalizeHashedFragment(urlString);
+ }
+
+ // Nothing to normalize here, return verbatim
+ return urlString;
+ }
+
+ /**
+ * Returns a normalized input URL. #! querystrings are transformed
+ * to a _escaped_fragment_ form.
+ *
+ * @param String urlString
+ * @return String
+ */
+ protected String normalizeHashedFragment(String urlString) throws
MalformedURLException {
+ URL u = new URL(urlString);
+ int pos = urlString.indexOf(AJAX_URL_PART);
+ StringBuilder sb = new StringBuilder(urlString.substring(0, pos));
+
+ // Get the escaped fragment
+ String escapedFragment = escape(urlString.substring(pos +
AJAX_URL_PART.length()));
+
+ // Check if we already have a query in the URL
+ if (u.getQuery() == null) {
+ sb.append("?");
+ } else {
+ sb.append("&");
+ }
+
+ // Append the escaped fragment key and the value
+ sb.append(ESCAPED_URL_PART);
+ sb.append(escapedFragment);
+
+ return sb.toString();
+ }
+
+ /**
+ * Returns a normalized input URL. _escaped_fragment_ querystrings are
+ * transformed to a #! form.
+ *
+ * @param String urlString
+ * @return String
+ */
+ protected String normalizeEscapedFragment(String urlString) throws
MalformedURLException {
+ int pos = urlString.indexOf(ESCAPED_URL_PART);
+ URL u = new URL(urlString);
+ StringBuilder sb = new StringBuilder();
+
+ // Write the URL without query string, we'll handle that later
+ sb.append(u.getProtocol());
+ sb.append("://");
+ sb.append(u.getHost());
+ if (u.getPort() != -1) {
+ sb.append(":");
+ sb.append(u.getPort());
+ }
+ sb.append(u.getPath());
+
+ // Get the query string
+ String queryString = u.getQuery();
+
+ // Check if there's an & in the query string
+ int ampPos = queryString.indexOf("&");
+ String keyValuePair = null;
+
+ // If there's none, then the escaped fragment is the only k/v pair
+ if (ampPos == -1) {
+ keyValuePair = queryString;
+ queryString = "";
+ } else {
+ // Obtain the escaped k/v pair
+ keyValuePair = queryString.substring(ampPos + 1);
+
+ // Remove the escaped fragment key/value pair from the query string
+ queryString = queryString.replaceFirst("&" + keyValuePair, "");
+ }
+
+ // Remove escapedUrlPart from the keyValuePair
+ keyValuePair = keyValuePair.replaceFirst(ESCAPED_URL_PART, "");
+
+ // Get the fragment escaped
+ String unescapedFragment = unescape(keyValuePair);
+
+ // Append a possible query string, without original escaped fragment
+ if (queryString.length() > 0) {
+ sb.append("?");
+ sb.append(queryString);
+ }
+
+ // Append the fragment delimiter and the unescaped fragment
+ sb.append("#!");
+ sb.append(unescapedFragment);
+
+ return sb.toString();
+ }
+
+ /**
+ * Unescape some exotic characters in the fragment part
+ *
+ * @param String fragmentPart
+ * @return String
+ */
+ protected String unescape(String fragmentPart) {
+ try {
+ fragmentPart = URLDecoder.decode(fragmentPart, "UTF-8");
+ } catch (Exception e) {
+ /// bluh
+ }
+
+ return fragmentPart;
+ }
+
+ /**
+ * Escape some exotic characters in the fragment part
+ *
+ * @param String fragmentPart
+ * @return String
+ */
+ protected String escape(String fragmentPart) {
+ String hex = null;
+ StringBuilder sb = new StringBuilder(fragmentPart.length());
+
+ for (byte b : fragmentPart.getBytes(utf8)) {
+ if (b < 33) {
+ sb.append('%');
+
+ hex = Integer.toHexString(b & 0xFF).toUpperCase();
+
+ // Prevent odd # chars
+ if (hex.length() % 2 != 0) {
+ sb.append('0');
+ }
+ sb.append(hex);
+ } else if (b == 35) {
+ sb.append("%23");
+ } else if (b == 37) {
+ sb.append("%25");
+ } else if (b == 38) {
+ sb.append("%26");
+ } else if (b == 43) {
+ sb.append("%2B");
+ } else {
+ sb.append((char)b);
+ }
+ }
+
+ return sb.toString();
+ }
+
+ /**
+ * @param Configuration conf
+ */
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+
+ /**
+ * @return Configuration
+ */
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+}
\ No newline at end of file
Added:
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java?rev=1659167&view=auto
==============================================================================
---
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
(added)
+++
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
Thu Feb 12 08:30:31 2015
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.ajax;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/** Unit tests for AjaxURLNormalizer. */
+public class TestAjaxURLNormalizer extends TestCase {
+ private AjaxURLNormalizer normalizer;
+ private Configuration conf;
+
+ public TestAjaxURLNormalizer(String name) {
+ super(name);
+ normalizer = new AjaxURLNormalizer();
+ conf = NutchConfiguration.create();
+ normalizer.setConf(conf);
+ }
+
+ public void testNormalizer() throws Exception {
+ // check if AJAX URL's are normalized to an _escaped_frament_ form
+ normalizeTest("http://example.org/#!k=v",
"http://example.org/?_escaped_fragment_=k=v");
+
+ // Check with some escaped chars
+ normalizeTest("http://example.org/#!k=v&something=is wrong",
"http://example.org/?_escaped_fragment_=k=v%26something=is%20wrong");
+
+ // Check with query string and multiple fragment params
+
normalizeTest("http://example.org/path.html?queryparam=queryvalue#!key1=value1&key2=value2",
"http://example.org/path.html?queryparam=queryvalue&_escaped_fragment_=key1=value1%26key2=value2");
+ }
+
+ public void testNormalizerWhenIndexing() throws Exception {
+ // check if it works the other way around
+ normalizeTest("http://example.org/?_escaped_fragment_=key=value",
"http://example.org/#!key=value", URLNormalizers.SCOPE_INDEXER);
+
normalizeTest("http://example.org/?key=value&_escaped_fragment_=key=value",
"http://example.org/?key=value#!key=value", URLNormalizers.SCOPE_INDEXER);
+
normalizeTest("http://example.org/page.html?key=value&_escaped_fragment_=key=value%26something=is%20wrong",
"http://example.org/page.html?key=value#!key=value&something=is wrong",
URLNormalizers.SCOPE_INDEXER);
+ }
+
+ private void normalizeTest(String weird, String normal) throws Exception {
+ assertEquals(normal, normalizer.normalize(weird,
URLNormalizers.SCOPE_DEFAULT));
+ }
+
+ private void normalizeTest(String weird, String normal, String scope) throws
Exception {
+ assertEquals(normal, normalizer.normalize(weird, scope));
+ }
+
+ public static void main(String[] args) throws Exception {
+ new TestAjaxURLNormalizer("test").testNormalizer();
+ }
+}
\ No newline at end of file