http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java b/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java new file mode 100644 index 0000000..1de03a3 --- /dev/null +++ b/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java @@ -0,0 +1,91 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
package org.apache.nutch.net.urlnormalizer.querystring;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.util.URLUtil;

/**
 * URL normalizer plugin that normalizes query strings by sorting the query
 * string parameters. Not sorting query strings can lead to large amounts of
 * duplicate URLs such as {@code ?a=x&b=y} vs {@code ?b=y&a=x}.
 */
public class QuerystringURLNormalizer implements URLNormalizer {

  private Configuration conf;

  private static final Logger LOG = LoggerFactory
      .getLogger(QuerystringURLNormalizer.class);

  public QuerystringURLNormalizer() {
  }

  /** Returns the configuration last passed to {@link #setConf}. */
  public Configuration getConf() {
    return conf;
  }

  /** Stores the configuration; this normalizer reads no properties from it. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /**
   * Rebuilds the URL with its query string parameters sorted
   * lexicographically.
   *
   * <p>
   * The query is split on {@code '&'} and the resulting parts (whole
   * {@code name=value} strings) are sorted in natural {@link String} order,
   * so repeated parameter names end up ordered by their values. URLs without
   * a query string are returned unchanged.
   * </p>
   *
   * @param urlString
   *          the URL to normalize
   * @param scope
   *          the normalization scope; not used by this normalizer
   * @return the URL with sorted query parameters, or {@code urlString} itself
   *         when it has no query part
   * @throws MalformedURLException
   *           if {@code urlString} cannot be parsed by {@link URL}
   */
  public String normalize(String urlString, String scope)
      throws MalformedURLException {
    URL url = new URL(urlString);

    String queryString = url.getQuery();

    if (queryString == null) {
      return urlString;
    }

    List<String> queryStringParts = Arrays.asList(queryString.split("&"));
    Collections.sort(queryStringParts);

    StringBuilder sb = new StringBuilder(urlString.length());

    sb.append(url.getProtocol());
    sb.append("://");
    // Keep the user-info component ("user:password@"); leaving it out would
    // silently turn the URL into a different one.
    String userInfo = url.getUserInfo();
    if (userInfo != null) {
      sb.append(userInfo);
      sb.append("@");
    }
    sb.append(url.getHost());
    // getPort() returns -1 when the URL does not name a port explicitly
    if (url.getPort() > -1) {
      sb.append(":");
      sb.append(url.getPort());
    }
    sb.append(url.getPath());
    sb.append("?");
    sb.append(StringUtils.join(queryStringParts, "&"));
    if (url.getRef() != null) {
      sb.append("#");
      sb.append(url.getRef());
    }

    return sb.toString();
  }
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java b/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java new file mode 100644 index 0000000..005fbca --- /dev/null +++ b/nutch-plugins/urlnormalizer-querystring/src/main/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL normalizer which sorts the elements in the query part to avoid duplicates + * caused by permutations. 
+ */ +package org.apache.nutch.net.urlnormalizer.querystring; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java b/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java new file mode 100644 index 0000000..b85c55d --- /dev/null +++ b/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
package org.apache.nutch.net.urlnormalizer.querystring;

import java.net.MalformedURLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;

import junit.framework.TestCase;

/**
 * Checks that {@link QuerystringURLNormalizer} sorts query parameters and
 * leaves URLs without a query string untouched.
 */
public class TestQuerystringURLNormalizer extends TestCase {

  public void testQuerystringURLNormalizer() throws Exception {
    Configuration configuration = NutchConfiguration.create();

    QuerystringURLNormalizer underTest = new QuerystringURLNormalizer();
    underTest.setConf(configuration);

    // each row: { expected normalized URL, input URL }
    String[][] expectations = {
        { "http://example.com/?a=b&c=d", "http://example.com/?c=d&a=b" },
        { "http://example.com/a/b/c", "http://example.com/a/b/c" },
        { "http://example.com:1234/a/b/c", "http://example.com:1234/a/b/c" },
        { "http://example.com:1234/a/b/c#ref",
            "http://example.com:1234/a/b/c#ref" },
        { "http://example.com:1234/a/b/c?a=b&c=d#ref",
            "http://example.com:1234/a/b/c?c=d&a=b#ref" },
        { "http://example.com/?a=b&a=c&c=d",
            "http://example.com/?c=d&a=b&a=c" } };

    for (int row = 0; row < expectations.length; row++) {
      String expected = expectations[row][0];
      String input = expectations[row][1];
      assertEquals(expected,
          underTest.normalize(input, URLNormalizers.SCOPE_DEFAULT));
    }
  }
}
 http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/build.xml b/nutch-plugins/urlnormalizer-regex/build.xml new file mode 100644 index 0000000..76875ec --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/build.xml @@ -0,0 +1,34 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="urlnormalizer-regex" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="sample" includes="**/*.xml, **/*.test"/> + </copy> + + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/ivy.xml b/nutch-plugins/urlnormalizer-regex/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/plugin.xml b/nutch-plugins/urlnormalizer-regex/plugin.xml new file mode 100644 index 0000000..e75096f --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/plugin.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="urlnormalizer-regex" + name="Regex URL Normalizer" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlnormalizer-regex.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlnormalizer.regex" + name="Nutch Regex URL Normalizer" + point="org.apache.nutch.net.URLNormalizer"> + <implementation id="RegexURLNormalizer" + class="org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer"/> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/pom.xml b/nutch-plugins/urlnormalizer-regex/pom.xml new file mode 100644 index 0000000..bfb056e --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlnormalizer-regex</artifactId> + <packaging>jar</packaging> + + <name>urlnormalizer-regex</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test new file mode 100644 index 0000000..7867ad8 --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.test @@ -0,0 +1,84 @@ +# test simple removal of session id, keeping parameters before and after +http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php +http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php?f=2 +http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&q=3 http://foo.com/foo.php?f=2&q=3 
+http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&f=2 http://foo.com/foo.php?f=2 + +# test removal of different session ids including removal of ; in jsessionid +http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl http://www.foo.com/foo.php +http://www.foo.com/foo.php?Bv_SessionID=fassassddsajkl&x=y http://www.foo.com/foo.php?x=y +http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html +http://www.foo.com/foo.html?param=1&another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html?param=1&another=2 +http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1&another=2 http://www.foo.com/foo.html?param=1&another=2 +http://www.foo.com/foo.php?&x=1&sid=xyz&something=1 http://www.foo.com/foo.php?x=1&something=1 +http://www.foo.com/foo.html?_sessionID=824A6C0A13a7e11205wxN28F44E3 http://www.foo.com/foo.html +http://www.foo.com/foo.php?_sessionid=qmyrcedt&outputformat=html&path=/3_images/foo http://www.foo.com/foo.php?outputformat=html&path=/3_images/foo +http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en&_sessionid=e36902d5bb2d0d922fc24b43 http://www.foo.com/foo.php?_pid=2&_spid=0&lang=en +http://www.foo.com/foo.php?app=content&content=overview&lang=en&_sid=587fba8f825b05844526519fdb7d75c8&b=35&m=47 http://www.foo.com/foo.php?app=content&content=overview&lang=en&b=35&m=47 +# but NewsId (and similar) is not a session id (NUTCH-706, NUTCH-1328) +http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 +http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0 http://www.foo.com/home.cfm?language=en&country=uk&addressid=250646&pagingpos=0 + +# test removal default pages +http://www.foo.com/home/index.html http://www.foo.com/home/ +http://www.foo.com/index.html http://www.foo.com/ +http://www.foo.com/index.htm http://www.foo.com/ +http://www.foo.com/index.asp http://www.foo.com/ 
+http://www.foo.com/index.aspx http://www.foo.com/ +http://www.foo.com/index.php http://www.foo.com/ +http://www.foo.com/index.php3 http://www.foo.com/ +http://www.foo.com/default.html http://www.foo.com/ +http://www.foo.com/default.htm http://www.foo.com/ +http://www.foo.com/default.asp http://www.foo.com/ +http://www.foo.com/default.aspx http://www.foo.com/ +http://www.foo.com/default.php http://www.foo.com/ +http://www.foo.com/default.php3 http://www.foo.com/ +http://www.foo.com/something.php3 http://www.foo.com/something.php3 +http://www.foo.com/something.html http://www.foo.com/something.html +http://www.foo.com/something.asp http://www.foo.com/something.asp +http://www.foo.com/index.phtml http://www.foo.com/ +http://www.foo.com/index.cfm http://www.foo.com/ +http://www.foo.com/index.cgi http://www.foo.com/ +http://www.foo.com/index.HTML http://www.foo.com/ +http://www.foo.com/index.Htm http://www.foo.com/ +http://www.foo.com/index.ASP http://www.foo.com/ +http://www.foo.com/index.jsp http://www.foo.com/ +http://www.foo.com/index.jsf http://www.foo.com/ +http://www.foo.com/index.jspx http://www.foo.com/ +http://www.foo.com/index.jspfx http://www.foo.com/index.jspfx +http://www.foo.com/index.jspa http://www.foo.com/ +http://www.foo.com/index.jsps http://www.foo.com/index.jsps +http://www.foo.com/index.aspX http://www.foo.com/ +http://www.foo.com/index.PhP http://www.foo.com/ +http://www.foo.com/index.PhP4 http://www.foo.com/ +http://www.foo.com/default.HTml http://www.foo.com/ +http://www.foo.com/default.HTm http://www.foo.com/ +http://www.foo.com/default.ASp http://www.foo.com/ +http://www.foo.com/default.AspX http://www.foo.com/ +http://www.foo.com/default.PHP http://www.foo.com/ +http://www.foo.com/default.PHP3 http://www.foo.com/ +http://www.foo.com/index.phtml http://www.foo.com/ +http://www.foo.com/index.cfm http://www.foo.com/ +http://www.foo.com/index.cgi http://www.foo.com/ + +# ensure keeping non-default pages +http://www.foo.com/foo.php3 
http://www.foo.com/foo.php3 +http://www.foo.com/foo.html http://www.foo.com/foo.html +http://www.foo.com/foo.asp http://www.foo.com/foo.asp + +# test removal of interpage anchors and keeping query string +http://www.foo.com/foo.html#something http://www.foo.com/foo.html +http://www.foo.com/foo.html#something?x=y http://www.foo.com/foo.html?x=y + +# test general cleaning of bad urls +http://www.foo.com/foo.html?&x=y http://www.foo.com/foo.html?x=y +http://www.foo.com/foo.html?x=y&&&z=a http://www.foo.com/foo.html?x=y&z=a +http://www.foo.com/foo.html? http://www.foo.com/foo.html + +# remove double slashes but keep 2 slashes after protocol +http://www.foo.com//path//foo.html http://www.foo.com/path/foo.html +https://www.foo.com//path//foo.html https://www.foo.com/path/foo.html + +# normalize file: protocol prefix (keep one slash) +file:///path//foo.html file:/path/foo.html +file:/path//foo.html file:/path/foo.html http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml new file mode 100644 index 0000000..4d6eabc --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-default.xml @@ -0,0 +1,66 @@ +<?xml version="1.0"?> +<!-- This is the configuration file for the RegexUrlNormalize Class. + This is intended so that users can specify substitutions to be + done on URLs. The regex engine that is used is Perl5 compatible. + The rules are applied to URLs in the order they occur in this file. --> + +<!-- WATCH OUT: an XML parser reads this file, so ampersands must be + expanded to &amp; --> + +<!-- The following rules show how to strip out session IDs, default pages, + interpage anchors, etc. Order does matter! 
--> +<regex-normalize> + +<!-- removes session ids from urls (such as jsessionid and PHPSESSID) --> +<regex> + <pattern>(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|&|#|$)</pattern> + <substitution>$4</substitution> +</regex> + +<!-- changes default pages into standard for /index.html, etc. into / --> +<!-- these are commented in the default file but uncommented here for testing --> +<regex> + <pattern>/((?i)index|default)\.((?i)js[pf]{1}?[afx]?|cgi|cfm|asp[x]?|[psx]?htm[l]?|php[3456]?)(\?|&|#|$)</pattern> + <substitution>/$3</substitution> +</regex> + +<!-- removes interpage href anchors such as site.com#location --> +<regex> + <pattern>#.*?(\?|&|$)</pattern> + <substitution>$1</substitution> +</regex> + +<!-- cleans ?&var=value into ?var=value --> +<regex> + <pattern>\?&</pattern> + <substitution>\?</substitution> +</regex> + +<!-- cleans multiple sequential ampersands into a single ampersand --> +<regex> + <pattern>&{2,}</pattern> + <substitution>&</substitution> +</regex> + +<!-- removes trailing ?, ampersands, . 
--> +<regex> + <pattern>[\?&\.]$</pattern> + <substitution></substitution> +</regex> + +<!-- normalize file:/// protocol prefix: --> +<!-- keep one single slash (NUTCH-1483) --> +<regex> + <pattern>^file://+</pattern> + <substitution>file:/</substitution> +</regex> + +<!-- removes duplicate slashes but --> +<!-- * allow 2 slashes after colon ':' (indicating protocol) --> +<regex> + <pattern>(?<!:)/{2,}</pattern> + <substitution>/</substitution> +</regex> + +</regex-normalize> + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test new file mode 100644 index 0000000..9d92880 --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.test @@ -0,0 +1,8 @@ +# test removal of subdomains +http://www.foo.bar.com/ http://bar.com/ + +# test removal of url path +http://www.foo.bar.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://bar.com/ + +# test removal of urls in arguments +https://www.foo.bar.com/foo.php?url=http://www.example.com/test.php https://bar.com/ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml new file mode 100644 index 0000000..3698968 --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/sample/regex-normalize-scope1.xml @@ -0,0 +1,21 @@ +<?xml version="1.0"?> +<!-- This is the configuration file for the RegexUrlNormalize Class. + This is intended so that users can specify substitutions to be + done on URLs. 
The regex engine that is used is Perl5 compatible. + The rules are applied to URLs in the order they occur in this file. --> + +<!-- WATCH OUT: an XML parser reads this file, so ampersands must be + expanded to &amp; --> + +<!-- + The following rules show how to reduce urls so that + urls from the same domain are identical. This is useful + e.g. when calculating host counts, or splitting fetchlists. +--> +<regex-normalize> +<regex> + <pattern>(^[a-z]{3,5}://)([\w]+\.)*?(\w+\.\w+)[/$].*</pattern> + <substitution>$1$3/</substitution> +</regex> +</regex-normalize> + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java b/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java new file mode 100644 index 0000000..363da18 --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java @@ -0,0 +1,324 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.regex; + +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import javax.xml.parsers.DocumentBuilderFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.nutch.net.URLNormalizer; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.w3c.dom.Text; +import org.xml.sax.InputSource; + +/** + * Allows users to do regex substitutions on all/any URLs that are encountered, + * which is useful for stripping session IDs from URLs. + * + * <p> + * This class uses the <tt>urlnormalizer.regex.file</tt> property. It should be + * set to the file name of an xml file which should contain the patterns and + * substitutions to be done on encountered URLs. + * </p> + * <p> + * This class also supports different rules depending on the scope. Please see + * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details. + * </p> + * + * @author Luke Baker + * @author Andrzej Bialecki + */ +public class RegexURLNormalizer extends Configured implements URLNormalizer { + + private static final Logger LOG = LoggerFactory + .getLogger(RegexURLNormalizer.class); + + /** + * Class which holds a compiled pattern and its corresponding substitution + * string. 
+ */ + private static class Rule { + public Pattern pattern; + + public String substitution; + } + + private ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal = new ThreadLocal<HashMap<String, List<Rule>>>() { + protected java.util.HashMap<String, java.util.List<Rule>> initialValue() { + return new HashMap<String, List<Rule>>(); + }; + }; + + public HashMap<String, List<Rule>> getScopedRules() { + return scopedRulesThreadLocal.get(); + } + + private List<Rule> defaultRules; + + private static final List<Rule> EMPTY_RULES = Collections.emptyList(); + + /** + * The default constructor which is called from UrlNormalizerFactory + * (normalizerClass.newInstance()) in method: getNormalizer()* + */ + public RegexURLNormalizer() { + super(null); + } + + public RegexURLNormalizer(Configuration conf) { + super(conf); + } + + /** + * Constructor which can be passed the file name, so it doesn't look in the + * configuration files for it. + */ + public RegexURLNormalizer(Configuration conf, String filename) + throws IOException, PatternSyntaxException { + super(conf); + List<Rule> rules = readConfigurationFile(filename); + if (rules != null) { + defaultRules = rules; + } + } + + public void setConf(Configuration conf) { + super.setConf(conf); + if (conf == null) + return; + // the default constructor was called + + String filename = getConf().get("urlnormalizer.regex.file"); + String stringRules = getConf().get("urlnormalizer.regex.rules"); + Reader reader = null; + if (stringRules != null) { + reader = new StringReader(stringRules); + } else { + reader = getConf().getConfResourceAsReader(filename); + } + List<Rule> rules = null; + if (reader == null) { + LOG.warn("Can't load the default rules! "); + rules = EMPTY_RULES; + } else { + try { + rules = readConfiguration(reader); + } catch (Exception e) { + LOG.warn("Couldn't read default config: " + e); + rules = EMPTY_RULES; + } + } + defaultRules = rules; + } + + // used in JUnit test. 
+ void setConfiguration(Reader reader, String scope) { + List<Rule> rules = readConfiguration(reader); + getScopedRules().put(scope, rules); + LOG.debug("Set config for scope '" + scope + "': " + rules.size() + + " rules."); + } + + /** + * This function does the replacements by iterating through all the regex + * patterns. It accepts a string url as input and returns the altered string. + */ + public String regexNormalize(String urlString, String scope) { + HashMap<String, List<Rule>> scopedRules = getScopedRules(); + List<Rule> curRules = scopedRules.get(scope); + if (curRules == null) { + // try to populate + String configFile = getConf().get("urlnormalizer.regex.file." + scope); + if (configFile != null) { + LOG.debug("resource for scope '" + scope + "': " + configFile); + try { + Reader reader = getConf().getConfResourceAsReader(configFile); + curRules = readConfiguration(reader); + scopedRules.put(scope, curRules); + } catch (Exception e) { + LOG.warn("Couldn't load resource '" + configFile + "': " + e); + } + } + if (curRules == EMPTY_RULES || curRules == null) { + LOG.info("can't find rules for scope '" + scope + "', using default"); + scopedRules.put(scope, EMPTY_RULES); + } + } + if (curRules == EMPTY_RULES || curRules == null) { + curRules = defaultRules; + } + Iterator<Rule> i = curRules.iterator(); + while (i.hasNext()) { + Rule r = (Rule) i.next(); + + Matcher matcher = r.pattern.matcher(urlString); + + urlString = matcher.replaceAll(r.substitution); + } + return urlString; + } + + public String normalize(String urlString, String scope) + throws MalformedURLException { + return regexNormalize(urlString, scope); + } + + /** Reads the configuration file and populates a List of Rules. 
*/ + private List<Rule> readConfigurationFile(String filename) { + if (LOG.isInfoEnabled()) { + LOG.info("loading " + filename); + } + try { + FileReader reader = new FileReader(filename); + return readConfiguration(reader); + } catch (Exception e) { + LOG.error("Error loading rules from '" + filename + "': " + e); + return EMPTY_RULES; + } + } + + private List<Rule> readConfiguration(Reader reader) { + List<Rule> rules = new ArrayList<Rule>(); + try { + + // borrowed heavily from code in Configuration.java + Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() + .parse(new InputSource(reader)); + Element root = doc.getDocumentElement(); + if ((!"regex-normalize".equals(root.getTagName())) + && (LOG.isErrorEnabled())) { + LOG.error("bad conf file: top-level element not <regex-normalize>"); + } + NodeList regexes = root.getChildNodes(); + for (int i = 0; i < regexes.getLength(); i++) { + Node regexNode = regexes.item(i); + if (!(regexNode instanceof Element)) + continue; + Element regex = (Element) regexNode; + if ((!"regex".equals(regex.getTagName())) && (LOG.isWarnEnabled())) { + LOG.warn("bad conf file: element not <regex>"); + } + NodeList fields = regex.getChildNodes(); + String patternValue = null; + String subValue = null; + for (int j = 0; j < fields.getLength(); j++) { + Node fieldNode = fields.item(j); + if (!(fieldNode instanceof Element)) + continue; + Element field = (Element) fieldNode; + if ("pattern".equals(field.getTagName()) && field.hasChildNodes()) + patternValue = ((Text) field.getFirstChild()).getData(); + if ("substitution".equals(field.getTagName()) + && field.hasChildNodes()) + subValue = ((Text) field.getFirstChild()).getData(); + if (!field.hasChildNodes()) + subValue = ""; + } + if (patternValue != null && subValue != null) { + Rule rule = new Rule(); + try { + rule.pattern = Pattern.compile(patternValue); + } catch (PatternSyntaxException e) { + if (LOG.isErrorEnabled()) { + LOG.error("skipped rule: " + patternValue + 
" -> " + subValue + + " : invalid regular expression pattern: " + e); + } + continue; + } + rule.substitution = subValue; + rules.add(rule); + } + } + } catch (Exception e) { + if (LOG.isErrorEnabled()) { + LOG.error("error parsing conf file: " + e); + } + return EMPTY_RULES; + } + if (rules.size() == 0) + return EMPTY_RULES; + return rules; + } + + /** Spits out patterns and substitutions that are in the configuration file. */ + public static void main(String args[]) throws PatternSyntaxException, + IOException { + RegexURLNormalizer normalizer = new RegexURLNormalizer(); + normalizer.setConf(NutchConfiguration.create()); + HashMap<String, List<Rule>> scopedRules = normalizer.getScopedRules(); + Iterator<Rule> i = normalizer.defaultRules.iterator(); + System.out.println("* Rules for 'DEFAULT' scope:"); + while (i.hasNext()) { + Rule r = i.next(); + System.out.print(" " + r.pattern.pattern() + " -> "); + System.out.println(r.substitution); + } + // load the scope + if (args.length > 1) { + normalizer.normalize("http://test.com", args[1]); + } + if (scopedRules.size() > 1) { + Iterator<String> it = scopedRules.keySet().iterator(); + while (it.hasNext()) { + String scope = it.next(); + if (URLNormalizers.SCOPE_DEFAULT.equals(scope)) + continue; + System.out.println("* Rules for '" + scope + "' scope:"); + i = ((List<Rule>) scopedRules.get(scope)).iterator(); + while (i.hasNext()) { + Rule r = (Rule) i.next(); + System.out.print(" " + r.pattern.pattern() + " -> "); + System.out.println(r.substitution); + } + } + } + if (args.length > 0) { + System.out.println("\n---------- Normalizer test -----------"); + String scope = URLNormalizers.SCOPE_DEFAULT; + if (args.length > 1) + scope = args[1]; + System.out.println("Scope: " + scope); + System.out.println("Input url: '" + args[0] + "'"); + System.out.println("Output url: '" + normalizer.normalize(args[0], scope) + + "'"); + } + System.exit(0); + } + +} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java b/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java new file mode 100644 index 0000000..04562c3 --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/src/main/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL normalizer with configurable rules based on regular expressions + * ({@link java.util.regex.Pattern}). 
+ */ +package org.apache.nutch.net.urlnormalizer.regex; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java b/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java new file mode 100644 index 0000000..cbf6c64 --- /dev/null +++ b/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java @@ -0,0 +1,186 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.net.urlnormalizer.regex; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileFilter; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.*; + +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +/** Unit tests for RegexUrlNormalizer. */ +public class TestRegexURLNormalizer { + private static final Logger LOG = LoggerFactory + .getLogger(TestRegexURLNormalizer.class); + + private RegexURLNormalizer normalizer; + private Configuration conf; + private Map<String, NormalizedURL[]> testData = new HashMap<String, NormalizedURL[]>(); + + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/urlnormalizer-regex/build.xml during plugin compilation. 
+ + public TestRegexURLNormalizer() throws IOException { + normalizer = new RegexURLNormalizer(); + conf = NutchConfiguration.create(); + normalizer.setConf(conf); + File[] configs = new File(sampleDir).listFiles(new FileFilter() { + public boolean accept(File f) { + if (f.getName().endsWith(".xml") + && f.getName().startsWith("regex-normalize-")) + return true; + return false; + } + }); + for (int i = 0; i < configs.length; i++) { + try { + FileReader reader = new FileReader(configs[i]); + String cname = configs[i].getName(); + cname = cname.substring(16, cname.indexOf(".xml")); + normalizer.setConfiguration(reader, cname); + NormalizedURL[] urls = readTestFile(cname); + testData.put(cname, urls); + } catch (Exception e) { + LOG.warn("Could load config from '" + configs[i] + "': " + e.toString()); + } + } + } + + @Test + public void testNormalizerDefault() throws Exception { + normalizeTest((NormalizedURL[]) testData.get(URLNormalizers.SCOPE_DEFAULT), + URLNormalizers.SCOPE_DEFAULT); + } + + @Test + public void testNormalizerScope() throws Exception { + Iterator<String> it = testData.keySet().iterator(); + while (it.hasNext()) { + String scope = it.next(); + normalizeTest((NormalizedURL[]) testData.get(scope), scope); + } + } + + private void normalizeTest(NormalizedURL[] urls, String scope) + throws Exception { + for (int i = 0; i < urls.length; i++) { + String url = urls[i].url; + String normalized = normalizer.normalize(urls[i].url, scope); + String expected = urls[i].expectedURL; + LOG.info("scope: " + scope + " url: " + url + " | normalized: " + + normalized + " | expected: " + expected); + Assert.assertEquals(urls[i].expectedURL, normalized); + } + } + + private void bench(int loops, String scope) { + long start = System.currentTimeMillis(); + try { + NormalizedURL[] expected = (NormalizedURL[]) testData.get(scope); + if (expected == null) + return; + for (int i = 0; i < loops; i++) { + normalizeTest(expected, scope); + } + } catch (Exception e) { + 
Assert.fail(e.toString()); + } + LOG.info("bench time (" + loops + ") " + + (System.currentTimeMillis() - start) + "ms"); + } + + private static class NormalizedURL { + String url; + String expectedURL; + + public NormalizedURL(String line) { + String[] fields = line.split("\\s+"); + url = fields[0]; + expectedURL = fields[1]; + } + } + + private NormalizedURL[] readTestFile(String scope) throws IOException { + File f = new File(sampleDir, "regex-normalize-" + scope + ".test"); + @SuppressWarnings("resource") + BufferedReader in = new BufferedReader(new InputStreamReader( + new FileInputStream(f), "UTF-8")); + List<NormalizedURL> list = new ArrayList<NormalizedURL>(); + String line; + while ((line = in.readLine()) != null) { + if (line.trim().length() == 0 || line.startsWith("#") + || line.startsWith(" ")) + continue; + list.add(new NormalizedURL(line)); + } + return (NormalizedURL[]) list.toArray(new NormalizedURL[list.size()]); + } + + public static void main(String[] args) throws Exception { + if (args.length == 0) { + System.err.println("TestRegexURLNormalizer [-bench <iter>] <scope>"); + System.exit(-1); + } + boolean bench = false; + int iter = -1; + String scope = null; + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-bench")) { + bench = true; + iter = Integer.parseInt(args[++i]); + } else + scope = args[i]; + } + if (scope == null) { + System.err.println("Missing required scope name."); + System.exit(-1); + } + if (bench && iter < 0) { + System.err.println("Invalid number of iterations: " + iter); + System.exit(-1); + } + TestRegexURLNormalizer test = new TestRegexURLNormalizer(); + NormalizedURL[] urls = (NormalizedURL[]) test.testData.get(scope); + if (urls == null) { + LOG.warn("Missing test data for scope '" + scope + + "', using default scope."); + scope = URLNormalizers.SCOPE_DEFAULT; + urls = (NormalizedURL[]) test.testData.get(scope); + } + if (bench) { + test.bench(iter, scope); + } else { + test.normalizeTest(urls, scope); + } + } 
+ +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-slash/build.xml b/nutch-plugins/urlnormalizer-slash/build.xml new file mode 100644 index 0000000..29b2262 --- /dev/null +++ b/nutch-plugins/urlnormalizer-slash/build.xml @@ -0,0 +1,27 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="urlnormalizer-slash" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="data" /> + </copy> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/data/slashes.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-slash/data/slashes.txt b/nutch-plugins/urlnormalizer-slash/data/slashes.txt new file mode 100644 index 0000000..d3bd70a --- /dev/null +++ b/nutch-plugins/urlnormalizer-slash/data/slashes.txt @@ -0,0 +1,7 @@ +# Both domains have duplicate URL's, some with slashes and some without + +# We prefer this domain with slashes +www.example.org + + +# ..but this domain without +www.example.net - \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-slash/ivy.xml b/nutch-plugins/urlnormalizer-slash/ivy.xml new file mode 100644 index 0000000..0a363f7 --- /dev/null +++ b/nutch-plugins/urlnormalizer-slash/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="${nutch.root}/ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-slash/plugin.xml b/nutch-plugins/urlnormalizer-slash/plugin.xml new file mode 100644 index 0000000..db820ed --- /dev/null +++ b/nutch-plugins/urlnormalizer-slash/plugin.xml @@ -0,0 +1,43 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="urlnormalizer-slash" + name="Slash URL Normalizer" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="urlnormalizer-slash.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.net.urlnormalizer.slash" + name="Nutch Slash URL Normalizer" + point="org.apache.nutch.net.URLNormalizer"> + <implementation id="SlashURLNormalizer" + class="org.apache.nutch.net.urlnormalizer.slash.SlashURLNormalizer"> + <parameter name="file" value="slashes.txt"/> + </implementation> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-slash/pom.xml b/nutch-plugins/urlnormalizer-slash/pom.xml new file mode 100644 index 0000000..0ac618f --- /dev/null +++ b/nutch-plugins/urlnormalizer-slash/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>urlnormalizer-slash</artifactId> + <packaging>jar</packaging> + + <name>urlnormalizer-slash</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/src/main/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-slash/src/main/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java b/nutch-plugins/urlnormalizer-slash/src/main/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java new file mode 100644 index 0000000..4554cf0 --- /dev/null +++ b/nutch-plugins/urlnormalizer-slash/src/main/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java @@ -0,0 +1,224 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.slash; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.URLNormalizer; +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.util.URLUtil; + +/** + * @author [email protected] + */ +public class SlashURLNormalizer implements URLNormalizer { + + private Configuration conf; + + private static final Logger LOG = LoggerFactory.getLogger(SlashURLNormalizer.class); + + private static final char QUESTION_MARK = '?'; + private static final char SLASH = '/'; + private static final char DOT = '.'; + private static final String PROTOCOL_DELIMITER = "://"; + + private static String attributeFile = null; + private String slashesFile = null; + + // We record a map of hosts and boolean, the boolean denotes whether the host should + // have slashes after URL paths. 
True means slash, false means remove the slash + private static final Map<String,Boolean> slashesMap = new HashMap<String,Boolean>(); + + public SlashURLNormalizer() {} + + public SlashURLNormalizer(String slashesFile) { + this.slashesFile = slashesFile; + } + + private synchronized void readConfiguration(Reader configReader) throws IOException { + if (slashesMap.size() > 0) { + return; + } + + BufferedReader reader = new BufferedReader(configReader); + String line, host; + String rule; + int delimiterIndex; + + while ((line = reader.readLine()) != null) { + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { + line.trim(); + delimiterIndex = line.indexOf(" "); + // try tabulator + if (delimiterIndex == -1) { + delimiterIndex = line.indexOf("\t"); + } + + host = line.substring(0, delimiterIndex); + rule = line.substring(delimiterIndex + 1).trim(); + + if (rule.equals("+")) { + slashesMap.put(host, true); + } else { + slashesMap.put(host, false); + } + } + } + } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + + // get the extensions for domain urlfilter + String pluginName = "urlnormalizer-slash"; + Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( + URLNormalizer.class.getName()).getExtensions(); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + if (extension.getDescriptor().getPluginId().equals(pluginName)) { + attributeFile = extension.getAttribute("file"); + break; + } + } + + // handle blank non empty input + if (attributeFile != null && attributeFile.trim().equals("")) { + attributeFile = null; + } + + if (attributeFile != null) { + if (LOG.isInfoEnabled()) { + LOG.info("Attribute \"file\" is defined for plugin " + pluginName + + " as " + attributeFile); + } + } + else { + if (LOG.isWarnEnabled()) { + LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " + + pluginName); + } + } + + // domain file and 
attribute "file" take precedence if defined + String file = conf.get("urlnormalizer.slashes.file"); + String stringRules = conf.get("urlnormalizer.slashes.rules"); + if (slashesFile != null) { + file = slashesFile; + } + else if (attributeFile != null) { + file = attributeFile; + } + Reader reader = null; + if (stringRules != null) { // takes precedence over files + reader = new StringReader(stringRules); + } else { + reader = conf.getConfResourceAsReader(file); + } + try { + if (reader == null) { + reader = new FileReader(file); + } + readConfiguration(reader); + } + catch (IOException e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } + + public String normalize(String url, String scope) throws MalformedURLException { + return normalize(url, null, scope); + } + + public String normalize(String url, CrawlDatum crawlDatum, String scope) throws MalformedURLException { + // Get URL repr. + URL u = new URL(url); + + // Get the host + String host = u.getHost(); + + // Do we have a rule for this host? + if (slashesMap.containsKey(host)) { + // Yes, separate the path and optional querystring + String protocol = u.getProtocol(); + String path = u.getPath(); + + // Don't do anything to root URL's + // / is always set by basic normalizer + if (path.length() > 1) { + String queryString = u.getQuery(); + + // Get the rule + boolean rule = slashesMap.get(host); + + // Does it have a trailing slash + int lastIndexOfSlash = path.lastIndexOf(SLASH); + boolean trailingSlash = (lastIndexOfSlash == path.length() - 1); + + // Do we need to add a trailing slash? 
+ if (!trailingSlash && rule) { + // Only add a trailing slash if this path doesn't appear to have an extension/suffix such as .html + int lastIndexOfDot = path.lastIndexOf(DOT); + if (path.length() < 6 || lastIndexOfDot == -1 || lastIndexOfDot < path.length() - 6) { + StringBuilder buffer = new StringBuilder(protocol); + buffer.append(PROTOCOL_DELIMITER); + buffer.append(host); + buffer.append(path); + buffer.append(SLASH); + if (queryString != null) { + buffer.append(QUESTION_MARK); + buffer.append(queryString); + } + url = buffer.toString(); + } + } + + // Do we need to remove a trailing slash? + else if (trailingSlash && !rule) { + StringBuilder buffer = new StringBuilder(protocol); + buffer.append(PROTOCOL_DELIMITER); + buffer.append(host); + buffer.append(path.substring(0, lastIndexOfSlash)); + if (queryString != null) { + buffer.append(QUESTION_MARK); + buffer.append(queryString); + } + url = buffer.toString(); + } + } + } + + return url; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java b/nutch-plugins/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java new file mode 100644 index 0000000..c3585e4 --- /dev/null +++ b/nutch-plugins/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.slash; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestSlashURLNormalizer extends TestCase { + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); + + public void testSlashURLNormalizer() throws Exception { + Configuration conf = NutchConfiguration.create(); + + String slashesFile = SAMPLES + SEPARATOR + "slashes.txt"; + SlashURLNormalizer normalizer = new SlashURLNormalizer(slashesFile); + normalizer.setConf(conf); + + // No change + assertEquals("http://example.org/", normalizer.normalize("http://example.org/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.net/", normalizer.normalize("http://example.net/", URLNormalizers.SCOPE_DEFAULT)); + + // Don't touch base URL's + assertEquals("http://example.org", normalizer.normalize("http://example.org", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.net", normalizer.normalize("http://example.net", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.org/", normalizer.normalize("http://example.org/", URLNormalizers.SCOPE_DEFAULT)); + 
assertEquals("http://example.net/", normalizer.normalize("http://example.net/", URLNormalizers.SCOPE_DEFAULT)); + + // Change + assertEquals("http://www.example.org/page/", normalizer.normalize("http://www.example.org/page", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://www.example.net/path/to/something", normalizer.normalize("http://www.example.net/path/to/something/", URLNormalizers.SCOPE_DEFAULT)); + + // No change + assertEquals("http://example.org/buh/", normalizer.normalize("http://example.org/buh/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.net/blaat", normalizer.normalize("http://example.net/blaat", URLNormalizers.SCOPE_DEFAULT)); + + // No change + assertEquals("http://example.nl/buh/", normalizer.normalize("http://example.nl/buh/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://example.de/blaat", normalizer.normalize("http://example.de/blaat", URLNormalizers.SCOPE_DEFAULT)); + + // Change + assertEquals("http://www.example.org/page/?a=b&c=d", normalizer.normalize("http://www.example.org/page?a=b&c=d", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://www.example.net/path/to/something?a=b&c=d", normalizer.normalize("http://www.example.net/path/to/something/?a=b&c=d", URLNormalizers.SCOPE_DEFAULT)); + + // No change + assertEquals("http://www.example.org/noise.mp3", normalizer.normalize("http://www.example.org/noise.mp3", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://www.example.org/page.html", normalizer.normalize("http://www.example.org/page.html", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://www.example.org/page.shtml", normalizer.normalize("http://www.example.org/page.shtml", URLNormalizers.SCOPE_DEFAULT)); + + // Change + assertEquals("http://www.example.org/this.is.not.an_extension/", normalizer.normalize("http://www.example.org/this.is.not.an_extension", URLNormalizers.SCOPE_DEFAULT)); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/pom.xml 
---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index b92375c..8cffbc2 100644 --- a/pom.xml +++ b/pom.xml @@ -10,7 +10,7 @@ <packaging>pom</packaging> <properties> - + <junit.version>4.12</junit.version> </properties> <modules> <module>nutch-core</module> @@ -26,6 +26,26 @@ <target>1.7</target> </configuration> </plugin> + <plugin> + <artifactId>maven-clean-plugin</artifactId> + <version>3.0.0</version> + <configuration> + <filesets> + <fileset> + <directory>runtime/</directory> + <followSymlinks>false</followSymlinks> + </fileset> + </filesets> + </configuration> + </plugin> </plugins> </build> + <dependencies> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>${junit.version}</version> + <scope>test</scope> + </dependency> + </dependencies> </project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/build-plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml deleted file mode 100755 index c759d5f..0000000 --- a/src/plugin/build-plugin.xml +++ /dev/null @@ -1,255 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. ---> -<!-- Imported by plugin build.xml files to define default targets. --> -<project xmlns:ivy="antlib:org.apache.ivy.ant"> - - <property name="name" value="${ant.project.name}"/> - <property name="root" value="${basedir}"/> - - <!-- load plugin-specific properties first --> - <property file="${user.home}/${name}.build.properties" /> - <property file="${root}/build.properties" /> - - <property name="nutch.root" location="${root}/../../../"/> - - <property name="src.dir" location="${root}/src/java"/> - <property name="src.test" location="${root}/src/test"/> - - <available file="${src.test}" type="dir" property="test.available"/> - - <property name="conf.dir" location="${nutch.root}/conf"/> - - <property name="build.dir" location="${nutch.root}/build/${name}"/> - <property name="build.classes" location="${build.dir}/classes"/> - <property name="build.test" location="${build.dir}/test"/> - <property name="build.test.lib" location="${build.test}/lib"/> - - <property name="deploy.dir" location="${nutch.root}/build/plugins/${name}"/> - - <!-- load nutch defaults last so that they can be overridden above --> - <property file="${nutch.root}/default.properties" /> - - <ivy:settings id="ivy.instance" file="${nutch.root}/ivy/ivysettings.xml" /> - - <path id="plugin.deps"/> - - <fileset id="lib.jars" dir="${root}" includes="lib/*.jar"/> - - <!-- the normal classpath --> - <path id="classpath"> - <pathelement location="${build.classes}"/> - <fileset refid="lib.jars"/> - <pathelement location="${nutch.root}/build/classes"/> - <fileset dir="${nutch.root}/build/lib"> - <include name="*.jar" /> - </fileset> - <path refid="plugin.deps"/> - <fileset dir="${deploy.dir}"> - <include name="*.jar" /> - </fileset> - </path> - - <!-- the unit test classpath --> - <path id="test.classpath"> - <pathelement location="${build.test}" /> - <pathelement 
location="${nutch.root}/build/test/classes"/> - <pathelement location="${nutch.root}/src/test"/> - <pathelement location="${conf.dir}"/> - <pathelement location="${nutch.root}/build"/> - <!-- test dependencies specific to current plugin --> - <fileset dir="${build.test.lib}"> - <include name="*.jar" /> - </fileset> - <!-- global test dependencies --> - <fileset dir="${nutch.root}/build/test/lib"> - <include name="*.jar" /> - </fileset> - <path refid="classpath"/> - </path> - - <!-- ====================================================== --> - <!-- Stuff needed by all targets --> - <!-- ====================================================== --> - <target name="init"> - <mkdir dir="${build.dir}"/> - <mkdir dir="${build.classes}"/> - <mkdir dir="${build.test}"/> - <mkdir dir="${build.test.lib}"/> - <mkdir dir="${deploy.dir}"/> - - <antcall target="init-plugin"/> - </target> - - <!-- to be overridden by sub-projects --> - <target name="init-plugin"/> - - <!-- - ! Used to build plugin compilation dependencies - ! (to be overridden by plugins) - !--> - <target name="deps-jar"/> - - <!-- - ! Used to deploy plugin runtime dependencies - ! (to be overridden by plugins) - !--> - <target name="deps-test"/> - - <!-- - ! Used to compile test for plugin runtime dependencies - ! 
(to be overridden by plugins) - !--> - <target name="deps-test-compile"/> - - <!-- ====================================================== --> - <!-- Compile the Java files --> - <!-- ====================================================== --> - <target name="compile" depends="init,deps-jar, resolve-default"> - <echo message="Compiling plugin: ${name}"/> - <javac - encoding="${build.encoding}" - srcdir="${src.dir}" - includes="**/*.java" - destdir="${build.classes}" - debug="${javac.debug}" - optimize="${javac.optimize}" - target="${javac.version}" - source="${javac.version}" - deprecation="${javac.deprecation}"> - <classpath refid="classpath"/> - </javac> - </target> - - <target name="compile-core"> - <ant target="compile-core" inheritall="false" dir="${nutch.root}"/> - <ant target="compile"/> - </target> - - <!-- ================================================================== --> - <!-- Make plugin .jar --> - <!-- ================================================================== --> - <!-- --> - <!-- ================================================================== --> - <target name="jar" depends="compile"> - <jar - jarfile="${build.dir}/${name}.jar" - basedir="${build.classes}" - /> - </target> - - <target name="jar-core" depends="compile-core"> - <jar - jarfile="${build.dir}/${name}.jar" - basedir="${build.classes}" - /> - </target> - - <!-- ================================================================== --> - <!-- Deploy plugin to ${deploy.dir} --> - <!-- ================================================================== --> - <!-- --> - <!-- ================================================================== --> - <target name="deploy" depends="jar, deps-test"> - <mkdir dir="${deploy.dir}"/> - <copy file="plugin.xml" todir="${deploy.dir}" - preservelastmodified="true"/> - <available property="lib-available" - file="${build.dir}/${name}.jar"/> - <antcall target="copy-generated-lib"/> - <copy todir="${deploy.dir}" flatten="true"> - <fileset 
refid="lib.jars"/> - </copy> - </target> - - <target name="copy-generated-lib" if="lib-available"> - <copy file="${build.dir}/${name}.jar" todir="${deploy.dir}" failonerror="false"/> - </target> - - <!-- ================================================================== --> - <!-- Compile test code --> - <!-- ================================================================== --> - <target name="compile-test" depends="compile, deps-test-compile" if="test.available"> - <javac - encoding="${build.encoding}" - srcdir="${src.test}" - includes="**/*.java" - destdir="${build.test}" - debug="${javac.debug}" - optimize="${javac.optimize}" - target="${javac.version}" - source="${javac.version}" - deprecation="${javac.deprecation}"> - <classpath refid="test.classpath"/> - </javac> - </target> - - <!-- ================================================================== --> - <!-- Run unit tests --> - <!-- ================================================================== --> - <target name="test" depends="compile-test, deploy" if="test.available"> - <echo message="Testing plugin: ${name}"/> - - <junit printsummary="yes" haltonfailure="no" fork="yes" - errorProperty="tests.failed" failureProperty="tests.failed"> - <sysproperty key="test.data" value="${build.test}/data"/> - <sysproperty key="test.input" value="${root}/data"/> - <sysproperty key="javax.xml.parsers.DocumentBuilderFactory" value="com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"/> - <classpath refid="test.classpath"/> - <formatter type="${test.junit.output.format}" /> - <batchtest todir="${build.test}" unless="testcase"> - <fileset dir="${src.test}" - includes="**/Test*.java" excludes="**/${test.exclude}.java" /> - </batchtest> - <batchtest todir="${build.test}" if="testcase"> - <fileset dir="${src.test}" includes="**/${testcase}.java"/> - </batchtest> - </junit> - - <fail if="tests.failed">Tests failed!</fail> - - </target> - - <!-- target: resolve ================================================= 
--> - <target name="resolve-default" depends="clean-lib" description="resolve and retrieve dependencies with ivy"> - <ivy:resolve file="ivy.xml" conf="default" log="download-only"/> - <ivy:retrieve pattern="${deploy.dir}/[artifact]-[revision].[ext]" symlink="false" log="quiet"/> - </target> - - <target name="resolve-test" depends="clean-lib" description="resolve and retrieve dependencies with ivy"> - <ivy:resolve file="ivy.xml" conf="test" log="download-only"/> - <ivy:retrieve pattern="${build.test.lib}/[artifact]-[revision].[ext]" symlink="false" log="quiet"/> - </target> - - <!-- ================================================================== --> - <!-- Clean. Delete the build files, and their directories --> - <!-- ================================================================== --> - <!-- target: clean =================================================== --> - <target name="clean" depends="clean-build, clean-lib" description="--> clean the project" /> - - <!-- target: clean-lib =============================================== --> - <target name="clean-lib" description="--> clean the project libraries directory (dependencies)"> - <delete includeemptydirs="true" dir="${build.lib.dir}"/> - </target> - - <!-- target: clean-build ============================================= --> - <target name="clean-build" description="--> clean the project built files"> - <delete includeemptydirs="true" dir="${build.dir}"/> - <delete includeemptydirs="true" dir="${deploy.dir}"/> - </target> - -</project>
