Author: markus
Date: Tue Jul  2 08:37:40 2013
New Revision: 1498832

URL: http://svn.apache.org/r1498832
Log:
NUTCH-1581 CrawlDB csv output to include metadata

Added:
    nutch/trunk/src/plugin/urlnormalizer-querystring/
    nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml
    nutch/trunk/src/plugin/urlnormalizer-querystring/ivy.xml
    nutch/trunk/src/plugin/urlnormalizer-querystring/plugin.xml
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/
    nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1498832&r1=1498831&r2=1498832&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jul  2 08:37:40 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1581 CrawlDB csv output to include metadata (markus)
+
 * NUTCH-1327 QueryStringNormalizer (markus)
 
 * NUTCH-1593 Normalize option missing in SegmentMerger's usage (markus)

Modified: nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1498832&r1=1498831&r2=1498832&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Tue Jul  2 08:37:40 2013
@@ -70,6 +70,7 @@
      <ant dir="urlnormalizer-basic" target="deploy"/>
      <ant dir="urlnormalizer-host" target="deploy"/>
      <ant dir="urlnormalizer-pass" target="deploy"/>
+     <ant dir="urlnormalizer-querystring" target="deploy"/>
      <ant dir="urlnormalizer-regex" target="deploy"/>
   </target>
 
@@ -105,6 +106,7 @@
      <ant dir="urlnormalizer-basic" target="test"/>
      <ant dir="urlnormalizer-host" target="test"/>
      <ant dir="urlnormalizer-pass" target="test"/>
+     <ant dir="urlnormalizer-querystring" target="test"/>
      <ant dir="urlnormalizer-regex" target="test"/>
     </parallel>
   </target>
@@ -159,6 +161,7 @@
     <ant dir="urlnormalizer-host" target="clean"/>
     <ant dir="urlnormalizer-basic" target="clean"/>
     <ant dir="urlnormalizer-pass" target="clean"/>
+    <ant dir="urlnormalizer-querystring" target="clean"/>
     <ant dir="urlnormalizer-regex" target="clean"/>
   </target>
 </project>

Added: nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml?rev=1498832&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml Tue Jul  2 
08:37:40 2013
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-querystring" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Added: nutch/trunk/src/plugin/urlnormalizer-querystring/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/ivy.xml?rev=1498832&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-querystring/ivy.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-querystring/ivy.xml Tue Jul  2 
08:37:40 2013
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/urlnormalizer-querystring/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/plugin.xml?rev=1498832&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-querystring/plugin.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-querystring/plugin.xml Tue Jul  2 
08:37:40 2013
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlnormalizer-querystring"
+   name="Querystrings URL Normalizer"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlnormalizer-querystring.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlnormalizer.querystring"
+              name="Nutch Querystring URL Normalizer"
+              point="org.apache.nutch.net.URLNormalizer">
+      <implementation id="QuerystringURLNormalizer"
+                      
class="org.apache.nutch.net.urlnormalizer.querystring.QuerystringURLNormalizer">
+      </implementation>
+   </extension>
+
+</plugin>

Added: 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java?rev=1498832&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
 (added)
+++ 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
 Tue Jul  2 08:37:40 2013
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.querystring;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * URL normalizer plugin for normalizing query strings by sorting
+ * query string parameters. Not sorting query strings can lead to large
+ * amounts of duplicate URLs such as ?a=x&b=y vs ?b=y&a=x.
+ *
+ */
+public class QuerystringURLNormalizer implements URLNormalizer {
+
+  private Configuration conf;
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(QuerystringURLNormalizer.class);
+
+  public QuerystringURLNormalizer() {}
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public String normalize(String urlString, String scope) throws 
MalformedURLException {
+    URL url = new URL(urlString);
+    
+    String queryString = url.getQuery();
+    
+    if (queryString == null) {
+      return urlString;
+    }
+    
+    List<String> queryStringParts = Arrays.asList(queryString.split("&"));
+    Collections.sort(queryStringParts);
+    
+    StringBuilder sb = new StringBuilder();
+    
+    sb.append(url.getProtocol());
+    sb.append("://");
+    sb.append(url.getHost());
+    if (url.getPort() > -1) {
+      sb.append(":");
+      sb.append(url.getPort());
+    }
+    sb.append(url.getPath());
+    sb.append("?");
+    sb.append(StringUtils.join(queryStringParts, "&"));
+    if (url.getRef() != null) {
+      sb.append("#");
+      sb.append(url.getRef());
+    }
+    
+    return sb.toString();
+  }
+}

Added: 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java?rev=1498832&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
 (added)
+++ 
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
 Tue Jul  2 08:37:40 2013
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.querystring;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestQuerystringURLNormalizer extends TestCase {
+
+  public void testQuerystringURLNormalizer() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+
+    QuerystringURLNormalizer normalizer = new QuerystringURLNormalizer();
+    normalizer.setConf(conf);
+    
+    assertEquals("http://example.com/?a=b&c=d", 
normalizer.normalize("http://example.com/?c=d&a=b", 
URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com/a/b/c", 
normalizer.normalize("http://example.com/a/b/c", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com:1234/a/b/c", 
normalizer.normalize("http://example.com:1234/a/b/c", 
URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com:1234/a/b/c#ref", 
normalizer.normalize("http://example.com:1234/a/b/c#ref", 
URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com:1234/a/b/c?a=b&c=d#ref", 
normalizer.normalize("http://example.com:1234/a/b/c?c=d&a=b#ref", 
URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.com/?a=b&a=c&c=d", 
normalizer.normalize("http://example.com/?c=d&a=b&a=c", 
URLNormalizers.SCOPE_DEFAULT));
+  }
+}


Reply via email to