Author: markus
Date: Tue Jul 2 08:37:40 2013
New Revision: 1498832
URL: http://svn.apache.org/r1498832
Log:
NUTCH-1581 CrawlDB csv output to include metadata
Added:
nutch/trunk/src/plugin/urlnormalizer-querystring/
nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml
nutch/trunk/src/plugin/urlnormalizer-querystring/ivy.xml
nutch/trunk/src/plugin/urlnormalizer-querystring/plugin.xml
nutch/trunk/src/plugin/urlnormalizer-querystring/src/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/build.xml
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1498832&r1=1498831&r2=1498832&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jul 2 08:37:40 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1581 CrawlDB csv output to include metadata (markus)
+
* NUTCH-1327 QueryStringNormalizer (markus)
* NUTCH-1593 Normalize option missing in SegmentMerger's usage (markus)
Modified: nutch/trunk/src/plugin/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1498832&r1=1498831&r2=1498832&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Tue Jul 2 08:37:40 2013
@@ -70,6 +70,7 @@
<ant dir="urlnormalizer-basic" target="deploy"/>
<ant dir="urlnormalizer-host" target="deploy"/>
<ant dir="urlnormalizer-pass" target="deploy"/>
+ <ant dir="urlnormalizer-querystring" target="deploy"/>
<ant dir="urlnormalizer-regex" target="deploy"/>
</target>
@@ -105,6 +106,7 @@
<ant dir="urlnormalizer-basic" target="test"/>
<ant dir="urlnormalizer-host" target="test"/>
<ant dir="urlnormalizer-pass" target="test"/>
+ <ant dir="urlnormalizer-querystring" target="test"/>
<ant dir="urlnormalizer-regex" target="test"/>
</parallel>
</target>
@@ -159,6 +161,7 @@
<ant dir="urlnormalizer-host" target="clean"/>
<ant dir="urlnormalizer-basic" target="clean"/>
<ant dir="urlnormalizer-pass" target="clean"/>
+ <ant dir="urlnormalizer-querystring" target="clean"/>
<ant dir="urlnormalizer-regex" target="clean"/>
</target>
</project>
Added: nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml?rev=1498832&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml Tue Jul 2 08:37:40 2013
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-querystring" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
Added: nutch/trunk/src/plugin/urlnormalizer-querystring/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/ivy.xml?rev=1498832&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-querystring/ivy.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-querystring/ivy.xml Tue Jul 2 08:37:40 2013
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
Added: nutch/trunk/src/plugin/urlnormalizer-querystring/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/plugin.xml?rev=1498832&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-querystring/plugin.xml (added)
+++ nutch/trunk/src/plugin/urlnormalizer-querystring/plugin.xml Tue Jul 2 08:37:40 2013
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlnormalizer-querystring"
+ name="Querystrings URL Normalizer"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlnormalizer-querystring.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlnormalizer.querystring"
+ name="Nutch Querystring URL Normalizer"
+ point="org.apache.nutch.net.URLNormalizer">
+ <implementation id="QuerystringURLNormalizer"
+ class="org.apache.nutch.net.urlnormalizer.querystring.QuerystringURLNormalizer">
+ </implementation>
+ </extension>
+
+</plugin>
Added:
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java?rev=1498832&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java (added)
+++ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java Tue Jul 2 08:37:40 2013
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.querystring;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * URL normalizer plugin for normalizing query strings by sorting
+ * query string parameters. Not sorting query strings can lead to large
+ * numbers of duplicate URLs such as ?a=x&b=y vs ?b=y&a=x.
+ *
+ */
+public class QuerystringURLNormalizer implements URLNormalizer {
+
+ private Configuration conf;
+
+ private static final Logger LOG = LoggerFactory.getLogger(QuerystringURLNormalizer.class);
+
+ public QuerystringURLNormalizer() {}
+
+ public Configuration getConf() {
+ return conf;
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+
+ public String normalize(String urlString, String scope) throws MalformedURLException {
+ URL url = new URL(urlString);
+
+ String queryString = url.getQuery();
+
+ if (queryString == null) {
+ return urlString;
+ }
+
+ List<String> queryStringParts = Arrays.asList(queryString.split("&"));
+ Collections.sort(queryStringParts);
+
+ StringBuilder sb = new StringBuilder();
+
+ sb.append(url.getProtocol());
+ sb.append("://");
+ sb.append(url.getHost());
+ if (url.getPort() > -1) {
+ sb.append(":");
+ sb.append(url.getPort());
+ }
+ sb.append(url.getPath());
+ sb.append("?");
+ sb.append(StringUtils.join(queryStringParts, "&"));
+ if (url.getRef() != null) {
+ sb.append("#");
+ sb.append(url.getRef());
+ }
+
+ return sb.toString();
+ }
+}
Added:
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java?rev=1498832&view=auto
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java (added)
+++ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java Tue Jul 2 08:37:40 2013
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.querystring;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestQuerystringURLNormalizer extends TestCase {
+
+ public void testQuerystringURLNormalizer() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+
+ QuerystringURLNormalizer normalizer = new QuerystringURLNormalizer();
+ normalizer.setConf(conf);
+
+ assertEquals("http://example.com/?a=b&c=d", normalizer.normalize("http://example.com/?c=d&a=b", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.com/a/b/c", normalizer.normalize("http://example.com/a/b/c", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.com:1234/a/b/c", normalizer.normalize("http://example.com:1234/a/b/c", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.com:1234/a/b/c#ref", normalizer.normalize("http://example.com:1234/a/b/c#ref", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.com:1234/a/b/c?a=b&c=d#ref", normalizer.normalize("http://example.com:1234/a/b/c?c=d&a=b#ref", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.com/?a=b&a=c&c=d", normalizer.normalize("http://example.com/?c=d&a=b&a=c", URLNormalizers.SCOPE_DEFAULT));
+ }
+}