http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java b/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java new file mode 100644 index 0000000..08a42f3 --- /dev/null +++ b/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.anchor; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlink; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +/** + * JUnit test case which tests 1. 
that anchor text is obtained 2. that anchor + * deduplication functionality is working + * + * @author lewismc + * + */ +public class TestAnchorIndexingFilter { + + @Test + public void testDeduplicateAnchor() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.setBoolean("anchorIndexingFilter.deduplicate", true); + AnchorIndexingFilter filter = new AnchorIndexingFilter(); + filter.setConf(conf); + Assert.assertNotNull(filter); + NutchDocument doc = new NutchDocument(); + ParseImpl parse = new ParseImpl("foo bar", new ParseData()); + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://test1.com/", "text1")); + inlinks.add(new Inlink("http://test2.com/", "text2")); + inlinks.add(new Inlink("http://test3.com/", "text2")); + try { + filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), + new CrawlDatum(), inlinks); + } catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.getMessage()); + } + Assert.assertNotNull(doc); + Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames() + .contains("anchor")); + Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor") + .getValues().size()); + } + +}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-basic/build.xml b/nutch-plugins/index-basic/build.xml new file mode 100755 index 0000000..a834290 --- /dev/null +++ b/nutch-plugins/index-basic/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="index-basic" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-basic/ivy.xml b/nutch-plugins/index-basic/ivy.xml new file mode 100644 index 0000000..848216e --- /dev/null +++ b/nutch-plugins/index-basic/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-basic/plugin.xml b/nutch-plugins/index-basic/plugin.xml new file mode 100755 index 0000000..c5d784d --- /dev/null +++ b/nutch-plugins/index-basic/plugin.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="index-basic" + name="Basic Indexing Filter" + version="1.0.0" + provider-name="nutch.org"> + + + <runtime> + <library name="index-basic.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.indexer.basic" + name="Nutch Basic Indexing Filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="BasicIndexingFilter" + class="org.apache.nutch.indexer.basic.BasicIndexingFilter"/> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-basic/pom.xml b/nutch-plugins/index-basic/pom.xml new file mode 100644 index 0000000..3dc3d91 --- /dev/null +++ b/nutch-plugins/index-basic/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>index-basic</artifactId> + <packaging>jar</packaging> + + <name>index-basic</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java b/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java new file mode 100644 index 0000000..8584fa8 --- /dev/null +++ b/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java @@ -0,0 +1,158 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.indexer.basic; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.Parse; + +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.util.StringUtil; +import org.apache.nutch.util.URLUtil; +import org.apache.hadoop.io.Text; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Date; + +import org.apache.hadoop.conf.Configuration; + +/** + * Adds basic searchable fields to a document. The fields added are: domain, + * host, url, content, title, cache, tstamp. The domain is included depending on + * {@code indexer.add.domain} in nutch-default.xml. The title is truncated as per + * {@code indexer.max.title.length} in nutch-default.xml. (As per NUTCH-1004, a + * zero-length title is not added.) The content is truncated as per + * {@code indexer.max.content.length} in nutch-default.xml. 
+ */ +public class BasicIndexingFilter implements IndexingFilter { + public static final Logger LOG = LoggerFactory + .getLogger(BasicIndexingFilter.class); + + private int MAX_TITLE_LENGTH; + private int MAX_CONTENT_LENGTH; + private boolean addDomain = false; + private Configuration conf; + + /** + * The {@link BasicIndexingFilter} filter object which supports a few + * configuration settings for adding basic searchable fields. See + * {@code indexer.add.domain}, {@code indexer.max.title.length}, + * {@code indexer.max.content.length} in nutch-default.xml. + * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL of the document being indexed + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text + * @return filtered NutchDocument + */ + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); + String reprUrlString = reprUrl != null ? reprUrl.toString() : null; + String urlString = url.toString(); + + String host = null; + try { + URL u; + if (reprUrlString != null) { + u = new URL(reprUrlString); + } else { + u = new URL(urlString); + } + + if (addDomain) { + doc.add("domain", URLUtil.getDomainName(u)); + } + + host = u.getHost(); + } catch (MalformedURLException e) { + throw new IndexingException(e); + } + + if (host != null) { + doc.add("host", host); + } + + doc.add("url", reprUrlString == null ? 
urlString : reprUrlString); + + // content + String content = parse.getText(); + if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) { + content = content.substring(0, MAX_CONTENT_LENGTH); + } + doc.add("content", StringUtil.cleanField(content)); + + // title + String title = parse.getData().getTitle(); + if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate + // title + // if + // needed + title = title.substring(0, MAX_TITLE_LENGTH); + } + + if (title.length() > 0) { + // NUTCH-1004 Do not index empty values for title field + doc.add("title", StringUtil.cleanField(title)); + } + + // add cached content/summary display policy, if available + String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY); + if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) { + doc.add("cache", caching); + } + + // add timestamp when fetched, for deduplication + doc.add("tstamp", new Date(datum.getFetchTime())); + + return doc; + } + + /** + * Set the {@link Configuration} object + */ + public void setConf(Configuration conf) { + this.conf = conf; + this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100); + this.addDomain = conf.getBoolean("indexer.add.domain", false); + this.MAX_CONTENT_LENGTH = conf.getInt("indexer.max.content.length", -1); + } + + /** + * Get the {@link Configuration} object + */ + public Configuration getConf() { + return this.conf; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/package.html b/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/package.html new file mode 100644 index 0000000..3fae405 --- /dev/null +++ b/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/package.html @@ 
-0,0 +1,5 @@ +<html> +<body> +<p>A basic indexing plugin, adds basic fields: url, host, title, content, etc.</p><p></p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java b/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java new file mode 100644 index 0000000..4bc317e --- /dev/null +++ b/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.indexer.basic; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.basic.BasicIndexingFilter; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Date; + +/** + * JUnit test case which tests 1. that basic searchable fields are added to a + * document 2. that domain is added as per {@code indexer.add.domain} in + * nutch-default.xml. 3. that title is truncated as per + * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is + * truncated as per {@code indexer.max.content.length} in nutch-default.xml. + * + * @author tejasp + * + */ + +public class TestBasicIndexingFilter { + + @Test + public void testBasicIndexingFilter() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.setInt("indexer.max.title.length", 10); + conf.setBoolean("indexer.add.domain", true); + conf.setInt("indexer.max.content.length", 20); + + BasicIndexingFilter filter = new BasicIndexingFilter(); + filter.setConf(conf); + Assert.assertNotNull(filter); + + NutchDocument doc = new NutchDocument(); + + String title = "The Foo Page"; + Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") }; + Metadata metaData = new Metadata(); + metaData.add("Language", "en/us"); + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, + outlinks, metaData); + ParseImpl parse = new ParseImpl( + "this is a sample foo bar page. 
hope you enjoy it.", parseData); + + CrawlDatum crawlDatum = new CrawlDatum(); + crawlDatum.setFetchTime(100L); + + Inlinks inlinks = new Inlinks(); + + try { + filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), + crawlDatum, inlinks); + } catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.getMessage()); + } + Assert.assertNotNull(doc); + Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc + .getField("title").getValues().get(0)); + Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc + .getField("domain").getValues().get(0)); + Assert.assertEquals("test host, expect \"nutch.apache.org\"", + "nutch.apache.org", doc.getField("host").getValues().get(0)); + Assert.assertEquals( + "test url, expect \"http://nutch.apache.org/index.html\"", + "http://nutch.apache.org/index.html", doc.getField("url").getValues() + .get(0)); + Assert.assertEquals("test content", "this is a sample foo", + doc.getField("content").getValues().get(0)); + Assert.assertEquals("test fetch time", new Date(100L), + (Date) doc.getField("tstamp").getValues().get(0)); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/build-ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-geoip/build-ivy.xml b/nutch-plugins/index-geoip/build-ivy.xml new file mode 100644 index 0000000..2cda7e9 --- /dev/null +++ b/nutch-plugins/index-geoip/build-ivy.xml @@ -0,0 +1,54 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="index-geoip" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> + + <property name="ivy.install.version" value="2.1.0" /> + <condition property="ivy.home" value="${env.IVY_HOME}"> + <isset property="env.IVY_HOME" /> + </condition> + <property name="ivy.home" value="${user.home}/.ant" /> + <property name="ivy.checksums" value="" /> + <property name="ivy.jar.dir" value="${ivy.home}/lib" /> + <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> + + <target name="download-ivy" unless="offline"> + + <mkdir dir="${ivy.jar.dir}"/> + <!-- download Ivy from web site so that it can be used even without any special installation --> + <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" + dest="${ivy.jar.file}" usetimestamp="true"/> + </target> + + <target name="init-ivy" depends="download-ivy"> + <!-- try to load ivy here from ivy home, in case the user has not already dropped + it into ant's lib dir (note that the latter copy will always take precedence). + We will not fail as long as local lib dir exists (it may be empty) and + ivy is in at least one of ant's lib dir or the local lib dir. 
--> + <path id="ivy.lib.path"> + <fileset dir="${ivy.jar.dir}" includes="*.jar"/> + + </path> + <taskdef resource="org/apache/ivy/ant/antlib.xml" + uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/> + </target> + + <target name="deps-jar" depends="init-ivy"> + <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-geoip/build.xml b/nutch-plugins/index-geoip/build.xml new file mode 100644 index 0000000..92fda82 --- /dev/null +++ b/nutch-plugins/index-geoip/build.xml @@ -0,0 +1,27 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="index-geoip" default="jar-core"> + + <import file="../build-plugin.xml"/> + <target name="init-plugin"> + <echo>Copying MaxMind GeoIP .mmdb files to build</echo> + <copy todir="${build.classes}"> + <fileset dir="${src.dir}" includes="**/*.mmdb" /> + </copy> + </target> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-geoip/ivy.xml b/nutch-plugins/index-geoip/ivy.xml new file mode 100644 index 0000000..1b626f0 --- /dev/null +++ b/nutch-plugins/index-geoip/ivy.xml @@ -0,0 +1,46 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.3.1" > + <!-- Exclude due to classpath issues --> + <exclude org="org.apache.httpcomponents" name="httpclient" /> + <exclude org="org.apache.httpcomponents" name="httpcore" /> + </dependency> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-geoip/plugin.xml b/nutch-plugins/index-geoip/plugin.xml new file mode 100644 index 0000000..214fbd0 --- /dev/null +++ b/nutch-plugins/index-geoip/plugin.xml @@ -0,0 +1,51 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="index-geoip" + name="GeoIP2 Indexing Filter" + version="1.0.0" + provider-name="nutch.org"> + + + <runtime> + <library name="index-geoip.jar"> + <export name="*"/> + </library> + <library name="commons-codec-1.6.jar"/> + <library name="commons-logging-1.1.1.jar"/> + <library name="geoip2-2.3.1.jar"/> + <library name="google-http-client-1.20.0.jar"/> + <library name="jackson-annotations-2.5.0.jar"/> + <library name="jackson-core-2.5.3.jar"/> + <library name="jackson-databind-2.5.3.jar"/> + <library name="jsr305-1.3.9.jar"/> + <library name="maxmind-db-1.0.0.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.indexer.geoip" + name="Nutch GeoIP2 Indexing Filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="GeoIPIndexingFilter" + class="org.apache.nutch.indexer.geoip.GeoIPIndexingFilter"/> + </extension> + +</plugin> + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-geoip/pom.xml b/nutch-plugins/index-geoip/pom.xml new file mode 100644 index 0000000..1238982 --- /dev/null +++ b/nutch-plugins/index-geoip/pom.xml @@ -0,0 +1,55 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>index-geoip</artifactId> + <packaging>jar</packaging> + + <name>index-geoip</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + <dependencies> + <dependency> + <groupId>com.maxmind.geoip2</groupId> + <artifactId>geoip2</artifactId> + <version>2.3.1</version> + <exclusions> + <exclusion> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>httpclient</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>httpcore</artifactId> + </exclusion> + </exclusions> + </dependency> + </dependencies> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java new file mode 100644 index 
0000000..88d78ef --- /dev/null +++ b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java @@ -0,0 +1,210 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.geoip; + +import java.io.IOException; +import java.net.InetAddress; +import java.net.UnknownHostException; + +import org.apache.nutch.indexer.NutchDocument; + +import com.maxmind.geoip2.DatabaseReader; +import com.maxmind.geoip2.WebServiceClient; +import com.maxmind.geoip2.exception.GeoIp2Exception; +import com.maxmind.geoip2.model.InsightsResponse; +import com.maxmind.geoip2.model.CityResponse; +import com.maxmind.geoip2.model.ConnectionTypeResponse; +import com.maxmind.geoip2.model.CountryResponse; +import com.maxmind.geoip2.model.DomainResponse; +import com.maxmind.geoip2.model.IspResponse; +import com.maxmind.geoip2.record.City; +import com.maxmind.geoip2.record.Continent; +import com.maxmind.geoip2.record.Country; +import com.maxmind.geoip2.record.Location; +import com.maxmind.geoip2.record.Postal; +import com.maxmind.geoip2.record.RepresentedCountry; +import com.maxmind.geoip2.record.Subdivision; +import com.maxmind.geoip2.record.Traits; + +/** + * <p> + * Simple utility class 
which enables efficient, structured + * {@link org.apache.nutch.indexer.NutchDocument} building based on input from + * {@link GeoIPIndexingFilter}, where configuration is also read. + * </p> + * <p> + * Based on the nature of the input, this class wraps factory type + * implementations for populating {@link org.apache.nutch.indexer.NutchDocument} + * instances with the correct {@link org.apache.nutch.indexer.NutchField} information. + * </p> + * + */ +public class GeoIPDocumentCreator { + + /** + * Default constructor. + */ + public GeoIPDocumentCreator() { + } + + public static NutchDocument createDocFromInsightsService(String serverIp, + NutchDocument doc, WebServiceClient client) throws UnknownHostException, + IOException, GeoIp2Exception { + doc.add("ip", serverIp); + InsightsResponse response = client + .insights(InetAddress.getByName(serverIp)); + // CityResponse response = client.city(InetAddress.getByName(serverIp)); + + City city = response.getCity(); + doc.add("cityName", city.getName()); // 'Minneapolis' + doc.add("cityConfidence", city.getConfidence()); // 50 + doc.add("cityGeoNameId", city.getGeoNameId()); + + Continent continent = response.getContinent(); + doc.add("continentCode", continent.getCode()); + doc.add("continentGeoNameId", continent.getGeoNameId()); + doc.add("continentName", continent.getName()); + + Country country = response.getCountry(); + doc.add("countryIsoCode", country.getIsoCode()); // 'US' + doc.add("countryName", country.getName()); // 'United States' + doc.add("countryConfidence", country.getConfidence()); // 99 + doc.add("countryGeoName", country.getGeoNameId()); + + Location location = response.getLocation(); + doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, + // -93.2323 + doc.add("accRadius", location.getAccuracyRadius()); // 3 + doc.add("timeZone", location.getTimeZone()); // 'America/Chicago' + doc.add("metroCode", location.getMetroCode()); + + Postal postal = response.getPostal(); + 
doc.add("postalCode", postal.getCode()); // '55455' + doc.add("postalConfidence", postal.getConfidence()); // 40 + + RepresentedCountry rCountry = response.getRepresentedCountry(); + doc.add("countryType", rCountry.getType()); + + Subdivision subdivision = response.getMostSpecificSubdivision(); + doc.add("subDivName", subdivision.getName()); // 'Minnesota' + doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN' + doc.add("subDivConfidence", subdivision.getConfidence()); // 90 + doc.add("subDivGeoNameId", subdivision.getGeoNameId()); + + Traits traits = response.getTraits(); + doc.add("autonSystemNum", traits.getAutonomousSystemNumber()); + doc.add("autonSystemOrg", traits.getAutonomousSystemOrganization()); + doc.add("domain", traits.getDomain()); + doc.add("isp", traits.getIsp()); + doc.add("org", traits.getOrganization()); + doc.add("userType", traits.getUserType()); + doc.add("isAnonProxy", traits.isAnonymousProxy()); + doc.add("isSatelliteProv", traits.isSatelliteProvider()); + return doc; + } + + @SuppressWarnings("unused") + public static NutchDocument createDocFromCityService(String serverIp, + NutchDocument doc, WebServiceClient client) throws UnknownHostException, + IOException, GeoIp2Exception { + CityResponse response = client.city(InetAddress.getByName(serverIp)); + return doc; + } + + @SuppressWarnings("unused") + public static NutchDocument createDocFromCountryService(String serverIp, + NutchDocument doc, WebServiceClient client) throws UnknownHostException, + IOException, GeoIp2Exception { + CountryResponse response = client.country(InetAddress.getByName(serverIp)); + return doc; + } + + public static NutchDocument createDocFromIspDb(String serverIp, + NutchDocument doc, DatabaseReader reader) throws UnknownHostException, + IOException, GeoIp2Exception { + IspResponse response = reader.isp(InetAddress.getByName(serverIp)); + doc.add("ip", serverIp); + doc.add("autonSystemNum", response.getAutonomousSystemNumber()); + doc.add("autonSystemOrg", 
response.getAutonomousSystemOrganization()); + doc.add("isp", response.getIsp()); + doc.add("org", response.getOrganization()); + return doc; + } + + public static NutchDocument createDocFromDomainDb(String serverIp, + NutchDocument doc, DatabaseReader reader) throws UnknownHostException, + IOException, GeoIp2Exception { + DomainResponse response = reader.domain(InetAddress.getByName(serverIp)); + doc.add("ip", serverIp); + doc.add("domain", response.getDomain()); + return doc; + } + + public static NutchDocument createDocFromConnectionDb(String serverIp, + NutchDocument doc, DatabaseReader reader) throws UnknownHostException, + IOException, GeoIp2Exception { + ConnectionTypeResponse response = reader.connectionType(InetAddress + .getByName(serverIp)); + doc.add("ip", serverIp); + doc.add("connType", response.getConnectionType().toString()); + return doc; + } + + public static NutchDocument createDocFromCityDb(String serverIp, + NutchDocument doc, DatabaseReader reader) throws UnknownHostException, + IOException, GeoIp2Exception { + doc.add("ip", serverIp); + CityResponse response = reader.city(InetAddress.getByName(serverIp)); + + City city = response.getCity(); + doc.add("cityName", city.getName()); // 'Minneapolis' + doc.add("cityConfidence", city.getConfidence()); // 50 + doc.add("cityGeoNameId", city.getGeoNameId()); + + Continent continent = response.getContinent(); + doc.add("continentCode", continent.getCode()); + doc.add("continentGeoNameId", continent.getGeoNameId()); + doc.add("continentName", continent.getName()); + + Country country = response.getCountry(); + doc.add("countryIsoCode", country.getIsoCode()); // 'US' + doc.add("countryName", country.getName()); // 'United States' + doc.add("countryConfidence", country.getConfidence()); // 99 + doc.add("countryGeoName", country.getGeoNameId()); + + Location location = response.getLocation(); + doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, + // -93.2323 + 
doc.add("accRadius", location.getAccuracyRadius()); // 3 + doc.add("timeZone", location.getTimeZone()); // 'America/Chicago' + doc.add("metroCode", location.getMetroCode()); + + Postal postal = response.getPostal(); + doc.add("postalCode", postal.getCode()); // '55455' + doc.add("postalConfidence", postal.getConfidence()); // 40 + + RepresentedCountry rCountry = response.getRepresentedCountry(); + doc.add("countryType", rCountry.getType()); + + Subdivision subdivision = response.getMostSpecificSubdivision(); + doc.add("subDivName", subdivision.getName()); // 'Minnesota' + doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN' + doc.add("subDivConfidence", subdivision.getConfidence()); // 90 + doc.add("subDivGeoNameId", subdivision.getGeoNameId()); + return doc; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java new file mode 100644 index 0000000..f515f1f --- /dev/null +++ b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java @@ -0,0 +1,241 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.geoip; + +import java.io.File; +import java.io.IOException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.maxmind.geoip2.DatabaseReader; +import com.maxmind.geoip2.WebServiceClient; + +/** + * <p> + * This plugin implements an indexing filter which takes advantage of the <a + * href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>. + * </p> + * <p> + * The third party library distribution provides an API for the GeoIP2 <a + * href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web + * services</a> and <a + * href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. The + * API also works with the free <a + * href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>. 
+ * </p> + * <p> + * Depending on the service level agreement you have with the GeoIP service + * provider, the plugin can add a number of the following fields to the index + * data model: + * <ol> + * <li>Continent</li> + * <li>Country</li> + * <li>Regional Subdivision</li> + * <li>City</li> + * <li>Postal Code</li> + * <li>Latitude/Longitude</li> + * <li>ISP/Organization</li> + * <li>AS Number</li> + * <li>Confidence Factors</li> + * <li>Radius</li> + * <li>User Type</li> + * </ol> + * </p> + * + * <p> + * Some of the services are documented at the <a + * href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2 Precision + * Services</a> webpage where more information can be obtained. + * </p> + * + * <p> + * You should also consult the following three properties in + * <code>nutch-site.xml</code> + * </p> + * + * <pre> + * {@code + * <!-- index-geoip plugin properties --> + * <property> + * <name>index.geoip.usage</name> + * <value>insightsService</value> + * <description> + * A string representing the information source to be used for GeoIP information + * association. Either enter 'cityDatabase', 'connectionTypeDatabase', + * 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the + * Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, + * GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath + * and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf. + * </description> + * </property> + * + * <property> + * <name>index.geoip.userid</name> + * <value></value> + * <description> + * The userId associated with the GeoIP2 Precision Services account. + * </description> + * </property> + * + * <property> + * <name>index.geoip.licensekey</name> + * <value></value> + * <description> + * The license key associated with the GeoIP2 Precision Services account. 
+ * </description> + * </property> + * } + * </pre> + * + */ +public class GeoIPIndexingFilter implements IndexingFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(GeoIPIndexingFilter.class); + + private Configuration conf; + + private String usage = null; + + private File geoDb = null; + + WebServiceClient client = null; + + DatabaseReader reader = null; + + // private AbstractResponse response = null; + + /** + * Default constructor for this plugin + */ + public GeoIPIndexingFilter() { + } + + /** + * @see org.apache.hadoop.conf.Configurable#getConf() + */ + @Override + public Configuration getConf() { + return this.conf; + } + + /** + * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration) + */ + @Override + public void setConf(Configuration conf) { + this.conf = conf; + String use = conf.get("index.geoip.usage", "insightsService"); + LOG.debug("GeoIP usage medium set to: {}", use); + if (use.equalsIgnoreCase("cityDatabase")) { + try { + geoDb = new File(conf.getResource("GeoIP2-City.mmdb").getFile()); + buildDb(); + } catch (Exception e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } else if (use.equalsIgnoreCase("connectionTypeDatabase")) { + try { + geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb") + .getFile()); + buildDb(); + } catch (Exception e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } else if (use.equalsIgnoreCase("domainDatabase")) { + try { + geoDb = new File(conf.getResource("GeoIP2-Domain.mmdb").getFile()); + buildDb(); + } catch (Exception e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } else if (use.equalsIgnoreCase("ispDatabase")) { + try { + geoDb = new File(conf.getResource("GeoIP2-ISP.mmdb").getFile()); + buildDb(); + } catch (Exception e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } else if 
(use.equalsIgnoreCase("insightsService")) { + client = new WebServiceClient.Builder(conf.getInt("index.geoip.userid", + 12345), conf.get("index.geoip.licensekey")).build(); + } + usage = use; + } + + private void buildDb() { + try { + reader = new DatabaseReader.Builder(geoDb).build(); + } catch (IOException e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } + + /** + * + * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument, + * org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text, + * org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks) + */ + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + return addServerGeo(doc, parse.getData(), url.toString()); + } + + private NutchDocument addServerGeo(NutchDocument doc, ParseData data, + String url) { + + if (conf.getBoolean("store.ip.address", false) == true) { + try { + String serverIp = data.getContentMeta().get("_ip_"); + if (serverIp != null) { + if (usage.equalsIgnoreCase("cityDatabase")) { + doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc, + reader); + } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) { + doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc, + reader); + } else if (usage.equalsIgnoreCase("domainDatabase")) { + doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc, + reader); + } else if (usage.equalsIgnoreCase("ispDatabase")) { + doc = GeoIPDocumentCreator + .createDocFromIspDb(serverIp, doc, reader); + } else if (usage.equalsIgnoreCase("insightsService")) { + doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp, + doc, client); + } + } + } catch (Exception e) { + LOG.error(e.getMessage()); + e.printStackTrace(); + } + } + return doc; + } + +} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/package-info.java b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/package-info.java new file mode 100644 index 0000000..ba62519 --- /dev/null +++ b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/package-info.java @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * <p>This plugin implements an indexing filter which takes + * advantage of the + * <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</p> + * <p>The third party library distribution provides an API for the GeoIP2 + * <a href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web services</a> + * and <a href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. + * The API also works with the free + * <a href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.</p> 
+ * + */ +package org.apache.nutch.indexer.geoip; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-links/build.xml b/nutch-plugins/index-links/build.xml new file mode 100644 index 0000000..b853ccf --- /dev/null +++ b/nutch-plugins/index-links/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="index-links" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-links/ivy.xml b/nutch-plugins/index-links/ivy.xml new file mode 100644 index 0000000..0a363f7 --- /dev/null +++ b/nutch-plugins/index-links/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="${nutch.root}/ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-links/plugin.xml b/nutch-plugins/index-links/plugin.xml new file mode 100644 index 0000000..dfdc5d2 --- /dev/null +++ b/nutch-plugins/index-links/plugin.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="index-links" + name="Index inlinks and outlinks" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="index-links.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.indexer.links.LinksIndexingFilter" + name="Links indexing filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="org.apache.nutch.indexer.links.LinksIndexingFilter" + class="org.apache.nutch.indexer.links.LinksIndexingFilter"/> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-links/pom.xml b/nutch-plugins/index-links/pom.xml new file mode 100644 index 0000000..e5e3a7f --- /dev/null +++ b/nutch-plugins/index-links/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>index-links</artifactId> + <packaging>jar</packaging> + + <name>index-links</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/src/main/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-links/src/main/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java b/nutch-plugins/index-links/src/main/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java new file mode 100644 index 0000000..975df66 --- /dev/null +++ b/nutch-plugins/index-links/src/main/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java @@ -0,0 +1,167 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.links; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlink; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.slf4j.LoggerFactory; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +/** + * An {@link org.apache.nutch.indexer.IndexingFilter} that adds + * <code>outlinks</code> and <code>inlinks</code> field(s) to the document. + * + * In case that you want to ignore the outlinks that point to the same host + * as the URL being indexed use the following settings in your configuration + * file: + * + * <property> + * <name>index.links.outlinks.host.ignore</name> + * <value>true</value> + * </property> + * + * The same configuration is available for inlinks: + * + * <property> + * <name>index.links.inlinks.host.ignore</name> + * <value>true</value> + * </property> + * + * To store only the host portion of each inlink URL or outlink URL add the + * following to your configuration file. 
+ * + * <property> + * <name>index.links.hosts.only</name> + * <value>true</value> + * </property> + * + */ +public class LinksIndexingFilter implements IndexingFilter { + + public final static String LINKS_OUTLINKS_HOST = "index.links.outlinks.host.ignore"; + public final static String LINKS_INLINKS_HOST = "index.links.inlinks.host.ignore"; + public final static String LINKS_ONLY_HOSTS = "index.links.hosts.only"; + + public final static org.slf4j.Logger LOG = LoggerFactory + .getLogger(LinksIndexingFilter.class); + + private Configuration conf; + private boolean filterOutlinks; + private boolean filterInlinks; + private boolean indexHost; + + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + // Add the outlinks + Outlink[] outlinks = parse.getData().getOutlinks(); + + if (outlinks != null) { + Set<String> hosts = new HashSet<String>(); + + for (Outlink outlink : outlinks) { + try { + String linkUrl = outlink.getToUrl(); + String outHost = new URL(linkUrl).getHost().toLowerCase(); + + if (indexHost) { + linkUrl = outHost; + + if (hosts.contains(linkUrl)) + continue; + + hosts.add(linkUrl); + } + + addFilteredLink("outlinks", url.toString(), linkUrl, outHost, + filterOutlinks, doc); + } catch (MalformedURLException e) { + LOG.error("Malformed URL in {}: {}", url, e.getMessage()); + } + } + } + + // Add the inlinks + if (null != inlinks) { + Iterator<Inlink> iterator = inlinks.iterator(); + Set<String> inlinkHosts = new HashSet<String>(); + + while (iterator.hasNext()) { + try { + Inlink link = iterator.next(); + String linkUrl = link.getFromUrl(); + String inHost = new URL(linkUrl).getHost().toLowerCase(); + + if (indexHost) { + linkUrl = inHost; + + if (inlinkHosts.contains(linkUrl)) + continue; + + inlinkHosts.add(linkUrl); + } + + addFilteredLink("inlinks", url.toString(), linkUrl, inHost, + filterInlinks, doc); + } catch (MalformedURLException e) { + 
LOG.error("Malformed URL in {}: {}", url, e.getMessage()); + } + } + } + + return doc; + } + + private void addFilteredLink(String fieldName, String url, String linkUrl, + String urlHost, boolean filter, NutchDocument doc) throws MalformedURLException { + if (filter) { + String host = new URL(url.toString()).getHost().toLowerCase(); + + if (!host.equalsIgnoreCase(urlHost)) { + doc.add(fieldName, linkUrl); + } + } else { + doc.add(fieldName, linkUrl); + } + } + + public void setConf(Configuration conf) { + this.conf = conf; + filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false); + filterInlinks = conf.getBoolean(LINKS_INLINKS_HOST, false); + + indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false); + } + + public Configuration getConf() { + return this.conf; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java b/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java new file mode 100644 index 0000000..c490d1f --- /dev/null +++ b/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java @@ -0,0 +1,218 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.links; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlink; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.NutchField; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.util.NutchConfiguration; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.net.URL; +import java.util.Iterator; + +public class TestLinksIndexingFilter { + + Configuration conf = NutchConfiguration.create(); + LinksIndexingFilter filter = new LinksIndexingFilter(); + Metadata metadata = new Metadata(); + + @Before + public void setUp() throws Exception { + metadata.add(Response.CONTENT_TYPE, "text/html"); + } + + private Outlink[] generateOutlinks() throws Exception { + return generateOutlinks(false); + } + + private Outlink[] generateOutlinks(boolean parts) throws Exception { + Outlink[] outlinks = new Outlink[2]; + + outlinks[0] = new Outlink("http://www.test.com", "test"); + outlinks[1] = new Outlink("http://www.example.com", "example"); + + if (parts) { + outlinks[0] = new Outlink(outlinks[0].getToUrl() + "/index.php?param=1", + "test"); + outlinks[1] = new Outlink(outlinks[1].getToUrl() 
+ "/index.php?param=2", + "test"); + } + + return outlinks; + } + + @Test + public void testFilterOutlinks() throws Exception { + conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); + filter.setConf(conf); + + Outlink[] outlinks = generateOutlinks(); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", outlinks, metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + Assert.assertEquals(1, doc.getField("outlinks").getValues().size()); + + Assert.assertEquals("Filter outlinks, allow only those from a different host", + outlinks[0].getToUrl(), doc.getFieldValue("outlinks")); + } + + @Test + public void testFilterInlinks() throws Exception { + conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); + filter.setConf(conf); + + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://www.test.com", "test")); + inlinks.add(new Inlink("http://www.example.com", "example")); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), inlinks); + + Assert.assertEquals(1, doc.getField("inlinks").getValues().size()); + + Assert.assertEquals("Filter inlinks, allow only those from a different host", + "http://www.test.com", doc.getFieldValue("inlinks")); + } + + @Test + public void testNoFilterOutlinks() throws Exception { + filter.setConf(conf); + + Outlink[] outlinks = generateOutlinks(); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", outlinks, metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + Assert.assertEquals("All outlinks must be indexed even those from the same host", + outlinks.length, doc.getField("outlinks").getValues().size()); + } + + @Test + public void testNoFilterInlinks() throws 
Exception { + conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false"); + filter.setConf(conf); + + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://www.test.com", "test")); + inlinks.add(new Inlink("http://www.example.com", "example")); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), inlinks); + + Assert.assertEquals("All inlinks must be indexed even those from the same host", + inlinks.size(), doc.getField("inlinks").getValues().size()); + } + + @Test + public void testIndexOnlyHostPart() throws Exception { + conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); + conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); + conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); + filter.setConf(conf); + + Outlink[] outlinks = generateOutlinks(true); + + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test")); + inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test")); + inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example", + "example")); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", outlinks, metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), inlinks); + + NutchField docOutlinks = doc.getField("outlinks"); + + Assert.assertEquals("Only the host portion of the outlink URL must be indexed", + new URL("http://www.test.com").getHost(), + docOutlinks.getValues().get(0)); + + Assert.assertEquals( + "The inlinks coming from the same host must count only once", 1, + doc.getField("inlinks").getValues().size()); + + Assert.assertEquals("Only the host portion of the inlinks URL must be indexed", + new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks")); + } + + @Test + public void 
testIndexHostsOnlyAndFilterOutlinks() throws Exception { + conf = NutchConfiguration.create(); + conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); + conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); + + Outlink[] outlinks = generateOutlinks(true); + + filter.setConf(conf); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", outlinks, metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + Assert.assertEquals(1, doc.getField("outlinks").getValues().size()); + + Assert.assertEquals( + "Index only the host portion of the outlinks after filtering", + new URL("http://www.test.com").getHost(), + doc.getFieldValue("outlinks")); + } + + @Test + public void testIndexHostsOnlyAndFilterInlinks() throws Exception { + conf = NutchConfiguration.create(); + conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); + conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); + + filter.setConf(conf); + + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://www.test.com", "test")); + inlinks.add(new Inlink("http://www.example.com", "example")); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), inlinks); + + Assert.assertEquals(1, doc.getField("inlinks").getValues().size()); + + Assert.assertEquals( + "Index only the host portion of the inlinks after filtering", + new URL("http://www.test.com").getHost(), + doc.getFieldValue("inlinks")); + + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java 
b/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java new file mode 100644 index 0000000..aaaedbf --- /dev/null +++ b/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse; + +import org.junit.Test; + +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.*; + +public class TestOutlinks { + + @Test + public void testAddSameObject() throws Exception { + Set<Outlink> set = new HashSet<>(); + + Outlink o = new Outlink("http://www.example.com", "Example"); + set.add(o); + set.add(o); + + assertEquals("Adding the same Outlink twice", 1, set.size()); + } + + @Test + public void testAddOtherObjectWithSameData() throws Exception { + Set<Outlink> set = new HashSet<>(); + + Outlink o = new Outlink("http://www.example.com", "Example"); + Outlink o1 = new Outlink("http://www.example.com", "Example"); + + assertTrue("The two Outlink objects are the same", o.equals(o1)); + + set.add(o); + set.add(o1); + + assertEquals("The set should contain only 1 Outlink", 1, set.size()); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-metadata/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-metadata/build.xml b/nutch-plugins/index-metadata/build.xml new file mode 100644 index 0000000..ad96d11 --- /dev/null +++ b/nutch-plugins/index-metadata/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="index-metadata" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-metadata/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-metadata/ivy.xml b/nutch-plugins/index-metadata/ivy.xml new file mode 100644 index 0000000..24d7606 --- /dev/null +++ b/nutch-plugins/index-metadata/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../../ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-metadata/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-metadata/plugin.xml b/nutch-plugins/index-metadata/plugin.xml new file mode 100644 index 0000000..4d4c9a7 --- /dev/null +++ b/nutch-plugins/index-metadata/plugin.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="index-metadata" + name="Index Metadata" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="index-metadata.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + + <extension id="org.apache.nutch.indexer.metadata" + name="Nutch metadata indexer" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="MetadataIndexer" + class="org.apache.nutch.indexer.metadata.MetadataIndexer"/> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-metadata/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-metadata/pom.xml b/nutch-plugins/index-metadata/pom.xml new file mode 100644 index 0000000..bef1b9a --- /dev/null +++ b/nutch-plugins/index-metadata/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>index-metadata</artifactId> + <packaging>jar</packaging> + + <name>index-metadata</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java new file mode 100644 index 0000000..78718aa --- /dev/null +++ b/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.indexer.metadata; + +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.Parse; + +/** + * Indexer which can be configured to extract metadata from the crawldb, parse + * metadata or content metadata. You can specify the properties "index.db.md", + * "index.parse.md" or "index.content.md" whose values are comma-delimited + * <value>key1,key2,key3</value>. 
+ */ +public class MetadataIndexer implements IndexingFilter { + private Configuration conf; + private String[] dbFieldnames; + private Map<String, String> parseFieldnames; + private String[] contentFieldnames; + private static final String db_CONF_PROPERTY = "index.db.md"; + private static final String parse_CONF_PROPERTY = "index.parse.md"; + private static final String content_CONF_PROPERTY = "index.content.md"; + + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + // just in case + if (doc == null) + return doc; + + // add the fields from crawldb + if (dbFieldnames != null) { + for (String metatag : dbFieldnames) { + Text metadata = (Text) datum.getMetaData().get(new Text(metatag)); + if (metadata != null) + doc.add(metatag, metadata.toString()); + } + } + + // add the fields from parsemd + if (parseFieldnames != null) { + for (String metatag : parseFieldnames.keySet()) { + for (String value : parse.getData().getParseMeta().getValues(metatag)) { + if (value != null) + doc.add(parseFieldnames.get(metatag), value); + } + } + } + + // add the fields from contentmd + if (contentFieldnames != null) { + for (String metatag : contentFieldnames) { + for (String value : parse.getData().getContentMeta().getValues(metatag)) { + if (value != null) + doc.add(metatag, value); + } + } + } + + return doc; + } + + public void setConf(Configuration conf) { + this.conf = conf; + dbFieldnames = conf.getStrings(db_CONF_PROPERTY); + parseFieldnames = new HashMap<String, String>(); + for (String metatag : conf.getStrings(parse_CONF_PROPERTY)) { + parseFieldnames.put(metatag.toLowerCase(Locale.ROOT), metatag); + } + contentFieldnames = conf.getStrings(content_CONF_PROPERTY); + + // TODO check conflict between field names e.g. could have same label + // from different sources + + } + + public Configuration getConf() { + return this.conf; + } +}
