Author: markus
Date: Thu Feb 12 08:30:31 2015
New Revision: 1659167

URL: http://svn.apache.org/r1659167
Log:
NUTCH-1323 AjaxNormalizer


Added:
    nutch/trunk/src/plugin/urlnormalizer-ajax/
    nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml
    nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml
    nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml
    nutch/trunk/src/plugin/urlnormalizer-ajax/src/
    nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/
    nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/
    nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/
    nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/
    
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/
    
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/
    
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
    nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/
    nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/
    nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/
    nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/
    nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/
    
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/
    
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/
    
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659167&r1=1659166&r2=1659167&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Feb 12 08:30:31 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1323 AjaxNormalizer (markus)
+
 * NUTCH-1918 TikaParser specifies a default namespace when generating DOM (jnioche)
 
 * NUTCH-1889 Store all values from Tika metadata in Nutch metadata (jnioche)

Modified: nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1659167&r1=1659166&r2=1659167&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Thu Feb 12 08:30:31 2015
@@ -69,6 +69,7 @@
      <ant dir="urlfilter-suffix" target="deploy"/>
      <ant dir="urlfilter-validator" target="deploy"/>
      <ant dir="urlmeta" target="deploy"/>
+     <ant dir="urlnormalizer-ajax" target="deploy"/>
      <ant dir="urlnormalizer-basic" target="deploy"/>
      <ant dir="urlnormalizer-host" target="deploy"/>
      <ant dir="urlnormalizer-pass" target="deploy"/>
@@ -107,6 +108,7 @@
      <ant dir="urlfilter-regex" target="test"/>
      <ant dir="urlfilter-suffix" target="test"/>
      <ant dir="urlfilter-validator" target="test"/>
+     <ant dir="urlnormalizer-ajax" target="test"/>
      <ant dir="urlnormalizer-basic" target="test"/>
      <ant dir="urlnormalizer-host" target="test"/>
      <ant dir="urlnormalizer-pass" target="test"/>
@@ -164,8 +166,9 @@
     <ant dir="urlfilter-suffix" target="clean"/>
     <ant dir="urlfilter-validator" target="clean"/>
     <ant dir="urlmeta" target="clean"/>
-    <ant dir="urlnormalizer-host" target="clean"/>
+    <ant dir="urlnormalizer-ajax" target="clean"/>
     <ant dir="urlnormalizer-basic" target="clean"/>
+    <ant dir="urlnormalizer-host" target="clean"/>
     <ant dir="urlnormalizer-pass" target="clean"/>
     <ant dir="urlnormalizer-querystring" target="clean"/>
     <ant dir="urlnormalizer-regex" target="clean"/>

Added: nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml?rev=1659167&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml Thu Feb 12 08:30:31 2015
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-ajax" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Added: nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml?rev=1659167&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml Thu Feb 12 08:30:31 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml?rev=1659167&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml Thu Feb 12 08:30:31 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlnormalizer-ajax"
+   name="AJAX URL Normalizer"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlnormalizer-ajax.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlnormalizer.ajax"
+              name="Nutch AJAX URL Normalizer"
+              point="org.apache.nutch.net.URLNormalizer">
+      <implementation id="AjaxURLNormalizer"
+                      class="org.apache.nutch.net.urlnormalizer.ajax.AjaxURLNormalizer"/>
+   </extension>
+
+</plugin>

Added: 
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java?rev=1659167&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
 (added)
+++ 
nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
 Thu Feb 12 08:30:31 2015
@@ -0,0 +1,236 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.ajax;
+
+import java.net.URL;
+import java.net.URI;
+import java.net.URLEncoder;
+import java.net.URLDecoder;
+import java.net.MalformedURLException;
+import java.nio.charset.Charset;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * URLNormalizer that converts between the two forms of an AJAX crawlable URL:
+ * the form carrying a <code>#!</code> hash fragment and the form carrying an
+ * <code>_escaped_fragment_=</code> query parameter.
+ *
+ * <p>In the indexer scope, URLs containing <code>_escaped_fragment_=</code> are
+ * rewritten to their <code>#!</code> form. Any other URL containing
+ * <code>#!</code> is rewritten to its <code>_escaped_fragment_</code> form,
+ * whatever the scope. All remaining URLs are returned unchanged.</p>
+ *
+ * <p>Use the following regex filter to prevent escaped fragments from being
+ * fetched:</p>
+ * <pre>^(.*)\?.*_escaped_fragment_</pre>
+ */
+public class AjaxURLNormalizer implements URLNormalizer {
+  public static final Logger LOG = LoggerFactory.getLogger(AjaxURLNormalizer.class);
+
+  /** Marker that starts the hash fragment of an AJAX URL. */
+  public static String AJAX_URL_PART = "#!";
+
+  /** Query parameter key (including the '=') of an escaped fragment. */
+  public static String ESCAPED_URL_PART = "_escaped_fragment_=";
+
+  /** Upper case hexadecimal digits used when percent-encoding a byte. */
+  private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();
+
+  private Configuration conf;
+  private final Charset utf8;
+
+  /**
+   * Default constructor.
+   */
+  public AjaxURLNormalizer() {
+    utf8 = Charset.forName("UTF-8");
+  }
+
+  /**
+   * Attempts to normalize the input URL string.
+   *
+   * @param urlString the URL to normalize
+   * @param scope the normalization scope; only the indexer scope triggers the
+   *          <code>_escaped_fragment_</code> to <code>#!</code> direction
+   * @return the normalized URL, or the input verbatim when it contains neither
+   *         marker
+   * @throws MalformedURLException if the URL has to be rewritten but cannot be
+   *           parsed
+   */
+  public String normalize(String urlString, String scope) throws MalformedURLException {
+    // This runs for every URL of a crawl, so keep the tracing at debug level
+    LOG.debug("{} // {}", scope, urlString);
+
+    // When indexing, transform _escaped_fragment_ URL's to their #! counterpart.
+    // The constant is the receiver so that a null scope cannot cause an NPE.
+    if (URLNormalizers.SCOPE_INDEXER.equals(scope)
+        && urlString.contains(ESCAPED_URL_PART)) {
+      return normalizeEscapedFragment(urlString);
+    }
+
+    // Otherwise transform #! URL's to their _escaped_fragment_ counterpart
+    if (urlString.contains(AJAX_URL_PART)) {
+      String normalized = normalizeHashedFragment(urlString);
+      LOG.debug("{} // {}", scope, normalized);
+      return normalized;
+    }
+
+    // Nothing to normalize here, return verbatim
+    return urlString;
+  }
+
+  /**
+   * Returns a normalized input URL. The <code>#!</code> fragment is transformed
+   * to an <code>_escaped_fragment_</code> query parameter that is appended to
+   * the existing query string, if any.
+   *
+   * @param urlString a URL containing <code>#!</code>
+   * @return the URL in its <code>_escaped_fragment_</code> form
+   * @throws MalformedURLException if the URL cannot be parsed
+   */
+  protected String normalizeHashedFragment(String urlString) throws MalformedURLException {
+    URL u = new URL(urlString);
+    int pos = urlString.indexOf(AJAX_URL_PART);
+
+    // Everything in front of the #! marker is kept as is
+    StringBuilder sb = new StringBuilder(urlString.substring(0, pos));
+
+    // Get the escaped fragment
+    String escapedFragment = escape(urlString.substring(pos + AJAX_URL_PART.length()));
+
+    // Start a new query string, or extend the one that is already there
+    sb.append(u.getQuery() == null ? '?' : '&');
+
+    // Append the escaped fragment key and the value
+    sb.append(ESCAPED_URL_PART);
+    sb.append(escapedFragment);
+
+    return sb.toString();
+  }
+
+  /**
+   * Returns a normalized input URL. The <code>_escaped_fragment_</code> query
+   * parameter is removed from the query string and its decoded value becomes
+   * the <code>#!</code> fragment. All other query parameters are preserved in
+   * their original order, wherever the escaped fragment parameter is located.
+   *
+   * @param urlString a URL containing <code>_escaped_fragment_=</code>
+   * @return the URL in its <code>#!</code> form, or the input verbatim if
+   *         <code>_escaped_fragment_=</code> is not a parameter of the query
+   *         string (for example when it only occurs in the path)
+   * @throws MalformedURLException if the URL cannot be parsed
+   */
+  protected String normalizeEscapedFragment(String urlString) throws MalformedURLException {
+    URL u = new URL(urlString);
+    String queryString = u.getQuery();
+
+    // The marker may occur outside of the query string, nothing to do then
+    if (queryString == null) {
+      return urlString;
+    }
+
+    // Locate the parameter; it only counts when it starts at a parameter
+    // boundary, i.e. at the very beginning or right after an '&'
+    int start = -1;
+    if (queryString.startsWith(ESCAPED_URL_PART)) {
+      start = 0;
+    } else {
+      int boundary = queryString.indexOf("&" + ESCAPED_URL_PART);
+      if (boundary != -1) {
+        start = boundary + 1;
+      }
+    }
+    if (start == -1) {
+      return urlString;
+    }
+
+    // The value runs up to the next '&' or to the end of the query string
+    int end = queryString.indexOf('&', start);
+    if (end == -1) {
+      end = queryString.length();
+    }
+    String escapedFragment = queryString.substring(start + ESCAPED_URL_PART.length(), end);
+
+    // Collect the query string without the escaped fragment parameter, using
+    // plain substring operations so URL characters are never read as a regex
+    StringBuilder remainingQuery = new StringBuilder();
+    if (start > 0) {
+      // Leave out the '&' in front of the escaped fragment parameter
+      remainingQuery.append(queryString.substring(0, start - 1));
+    }
+    if (end < queryString.length()) {
+      String tail = queryString.substring(end + 1);
+      if (remainingQuery.length() > 0 && tail.length() > 0) {
+        remainingQuery.append('&');
+      }
+      remainingQuery.append(tail);
+    }
+
+    // Write the URL without query string
+    StringBuilder sb = new StringBuilder();
+    sb.append(u.getProtocol());
+    sb.append("://");
+    sb.append(u.getHost());
+    if (u.getPort() != -1) {
+      sb.append(":");
+      sb.append(u.getPort());
+    }
+    sb.append(u.getPath());
+
+    // Append a possible query string, without original escaped fragment
+    if (remainingQuery.length() > 0) {
+      sb.append("?");
+      sb.append(remainingQuery);
+    }
+
+    // Append the fragment delimiter and the unescaped fragment
+    sb.append(AJAX_URL_PART);
+    sb.append(unescape(escapedFragment));
+
+    return sb.toString();
+  }
+
+  /**
+   * Unescapes the percent-encoded characters in the fragment part, decoding
+   * them as UTF-8.
+   *
+   * @param fragmentPart the escaped fragment value
+   * @return the decoded fragment, or the input verbatim if it cannot be decoded
+   */
+  protected String unescape(String fragmentPart) {
+    try {
+      return URLDecoder.decode(fragmentPart, "UTF-8");
+    } catch (IllegalArgumentException e) {
+      // Malformed percent-encoding; keep the fragment as it was handed to us
+      LOG.debug("Unable to unescape fragment {}: {}", fragmentPart, e.getMessage());
+    } catch (java.io.UnsupportedEncodingException e) {
+      // Every Java platform is required to support UTF-8
+      LOG.debug("UTF-8 is not supported: {}", e.getMessage());
+    }
+
+    return fragmentPart;
+  }
+
+  /**
+   * Escapes the characters in the fragment part that may not appear verbatim
+   * in an <code>_escaped_fragment_</code> value. The fragment is encoded as
+   * UTF-8 and the bytes 0x00 to 0x20, '#', '%', '&amp;', '+' and 0x7F to 0xFF
+   * are percent-encoded with upper case hex digits. All other bytes are
+   * copied as is.
+   *
+   * @param fragmentPart the fragment following the <code>#!</code> marker
+   * @return the escaped fragment
+   */
+  protected String escape(String fragmentPart) {
+    StringBuilder sb = new StringBuilder(fragmentPart.length());
+
+    for (byte b : fragmentPart.getBytes(utf8)) {
+      // Work with the unsigned value, bytes above 0x7F are negative in Java
+      int c = b & 0xFF;
+
+      boolean mustEscape = c <= 0x20 || c >= 0x7F
+          || c == '#' || c == '%' || c == '&' || c == '+';
+
+      if (mustEscape) {
+        sb.append('%');
+        sb.append(HEX_DIGITS[c >>> 4]);
+        sb.append(HEX_DIGITS[c & 0x0F]);
+      } else {
+        sb.append((char) c);
+      }
+    }
+
+    return sb.toString();
+  }
+
+  /**
+   * @param conf the configuration to use
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * @return the configuration set via {@link #setConf(Configuration)}
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}
\ No newline at end of file

Added: 
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java?rev=1659167&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
 (added)
+++ 
nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
 Thu Feb 12 08:30:31 2015
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.ajax;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/** Unit tests for AjaxURLNormalizer. */
+public class TestAjaxURLNormalizer extends TestCase {
+  private AjaxURLNormalizer normalizer;
+  private Configuration conf;
+  
+  public TestAjaxURLNormalizer(String name) {
+    super(name);
+    normalizer = new AjaxURLNormalizer();
+    conf = NutchConfiguration.create();
+    normalizer.setConf(conf);
+  }
+
+  public void testNormalizer() throws Exception {
+    // check if AJAX URL's are normalized to an _escaped_frament_ form
+    normalizeTest("http://example.org/#!k=v";, 
"http://example.org/?_escaped_fragment_=k=v";);
+
+    // Check with some escaped chars
+    normalizeTest("http://example.org/#!k=v&something=is wrong", 
"http://example.org/?_escaped_fragment_=k=v%26something=is%20wrong";);
+
+    // Check with query string and multiple fragment params
+    
normalizeTest("http://example.org/path.html?queryparam=queryvalue#!key1=value1&key2=value2";,
 
"http://example.org/path.html?queryparam=queryvalue&_escaped_fragment_=key1=value1%26key2=value2";);
+  }
+  
+  public void testNormalizerWhenIndexing() throws Exception {
+    // check if it works the other way around
+    normalizeTest("http://example.org/?_escaped_fragment_=key=value";, 
"http://example.org/#!key=value";, URLNormalizers.SCOPE_INDEXER);
+    
normalizeTest("http://example.org/?key=value&_escaped_fragment_=key=value";, 
"http://example.org/?key=value#!key=value";, URLNormalizers.SCOPE_INDEXER);
+    
normalizeTest("http://example.org/page.html?key=value&_escaped_fragment_=key=value%26something=is%20wrong";,
 "http://example.org/page.html?key=value#!key=value&something=is wrong", 
URLNormalizers.SCOPE_INDEXER);
+  }
+
+  private void normalizeTest(String weird, String normal) throws Exception {
+    assertEquals(normal, normalizer.normalize(weird, 
URLNormalizers.SCOPE_DEFAULT));
+  }
+  
+  private void normalizeTest(String weird, String normal, String scope) throws 
Exception {
+    assertEquals(normal, normalizer.normalize(weird, scope));
+  }
+
+  public static void main(String[] args) throws Exception {
+    new TestAjaxURLNormalizer("test").testNormalizer();
+  }
+}
\ No newline at end of file


Reply via email to