Author: lewismc
Date: Sat Jan 10 23:24:58 2015
New Revision: 1650829
URL: http://svn.apache.org/r1650829
Log:
NUTCH-1660 Index filter for Page's latitude and longitude
Added:
nutch/trunk/src/plugin/index-geoip/
nutch/trunk/src/plugin/index-geoip/build.xml
nutch/trunk/src/plugin/index-geoip/ivy.xml
nutch/trunk/src/plugin/index-geoip/plugin.xml
nutch/trunk/src/plugin/index-geoip/src/
nutch/trunk/src/plugin/index-geoip/src/java/
nutch/trunk/src/plugin/index-geoip/src/java/org/
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
nutch/trunk/src/plugin/index-geoip/src/test/
nutch/trunk/src/plugin/index-geoip/src/test/org/
nutch/trunk/src/plugin/index-geoip/src/test/org/apache/
nutch/trunk/src/plugin/index-geoip/src/test/org/apache/nutch/
nutch/trunk/src/plugin/index-geoip/src/test/org/apache/nutch/indexer/
nutch/trunk/src/plugin/index-geoip/src/test/org/apache/nutch/indexer/geoip/
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/build.xml
nutch/trunk/conf/nutch-default.xml
nutch/trunk/conf/schema-solr4.xml
nutch/trunk/conf/schema.xml
nutch/trunk/conf/solrindex-mapping.xml
nutch/trunk/default.properties
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/trunk/src/plugin/build.xml
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Jan 10 23:24:58 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1660 Index filter for Page's latitude and longitude (Yasin Kılınç,
lewismc)
+
* NUTCH-1140 index-more plugin, resetTitle creates multiple values in title
field (Joe Liedtke, kaveh minooie via snagel)
* NUTCH-1904 Schema for Solr4 doesn't include _version_ field (mattmann)
Modified: nutch/trunk/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Sat Jan 10 23:24:58 2015
@@ -176,6 +176,7 @@
<packageset dir="${plugins.dir}/index-basic/src/java"/>
<packageset dir="${plugins.dir}/index-metadata/src/java"/>
<packageset dir="${plugins.dir}/index-more/src/java"/>
+ <packageset dir="${plugins.dir}/index-geoip/src/java"/>
<packageset dir="${plugins.dir}/index-static/src/java"/>
<packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
<packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
@@ -579,6 +580,7 @@
<packageset dir="${plugins.dir}/headings/src/java"/>
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
+ <packageset dir="${plugins.dir}/index-geoip/src/java"/>
<packageset dir="${plugins.dir}/index-metadata/src/java"/>
<packageset dir="${plugins.dir}/index-more/src/java"/>
<packageset dir="${plugins.dir}/index-static/src/java"/>
@@ -958,6 +960,8 @@
<source path="${plugins.dir}/index-anchor/src/test/" />
<source path="${plugins.dir}/index-basic/src/java/" />
<source path="${plugins.dir}/index-basic/src/test/" />
+ <source path="${plugins.dir}/index-geoip/src/java/" />
+ <source path="${plugins.dir}/index-geoip/src/test/" />
<source path="${plugins.dir}/indexer-dummy/src/java/" />
<source path="${plugins.dir}/indexer-solr/src/java/" />
<source path="${plugins.dir}/indexer-elastic/src/java/" />
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Sat Jan 10 23:24:58 2015
@@ -1373,6 +1373,36 @@
</description>
</property>
+<!-- index-geoip plugin properties -->
+<property>
+ <name>index.geoip.usage</name>
+ <value>insightsService</value>
+ <description>
+ A string naming the information source to be used for GeoIP information
+ association. Enter one of 'cityDatabase', 'connectionTypeDatabase',
+ 'domainDatabase', 'ispDatabase' or 'insightsService'. If you use one of the
+ database options, the corresponding file (GeoIP2-City.mmdb,
+ GeoIP2-Connection-Type.mmdb, GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb,
+ respectively) must be available on the classpath at runtime.
+ </description>
+</property>
+
+<property>
+ <name>index.geoip.userid</name>
+ <value></value>
+ <description>
+ The userId associated with the GeoIP2 Precision Services account.
+ </description>
+</property>
+
+<property>
+ <name>index.geoip.licensekey</name>
+ <value></value>
+ <description>
+ The license key associated with the GeoIP2 Precision Services account.
+ </description>
+</property>
+
<!-- parse-metatags plugin properties -->
<property>
<name>metatags.names</name>
Modified: nutch/trunk/conf/schema-solr4.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/schema-solr4.xml?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/conf/schema-solr4.xml (original)
+++ nutch/trunk/conf/schema-solr4.xml Sat Jan 10 23:24:58 2015
@@ -79,7 +79,9 @@
Note: For faster range queries, consider the tdate type
-->
<fieldType name="date" class="solr.TrieDateField" omitNorms="true"
precisionStep="0" positionIncrementGap="0"/>
-
+
+ <fieldType name="location" class="solr.LatLonType"
subFieldSuffix="_coordinate"/>
+
<!-- A Trie based date field for faster date range queries and date
faceting. -->
<fieldType name="tdate" class="solr.TrieDateField" omitNorms="true"
precisionStep="6" positionIncrementGap="0"/>
@@ -298,6 +300,25 @@
any data added to them will be ignored outright. -->
<fieldtype name="ignored" stored="false" indexed="false"
multiValued="true" class="solr.StrField" />
+ <!-- boolean type: "true" or "false" -->
+ <fieldType name="boolean" class="solr.BoolField"
sortMissingLast="true"/>
+
+ <!-- The optional sortMissingLast and sortMissingFirst attributes are
+ currently supported on types that are sorted internally as strings
+ and on numeric types.
+ This includes "string","boolean", and, as of 3.5 (and 4.x),
+ int, float, long, date, double, including the "Trie" variants.
+ - If sortMissingLast="true", then a sort on this field will cause
documents
+ without the field to come after documents with the field,
+ regardless of the requested sort order (asc or desc).
+ - If sortMissingFirst="true", then a sort on this field will cause
documents
+ without the field to come before documents with the field,
+ regardless of the requested sort order.
+ - If sortMissingLast="false" and sortMissingFirst="false" (the default),
+ then default lucene sorting will be used which places docs without the
+ field first in an ascending sort and last in a descending sort.
+ -->
+
</types>
<fields>
@@ -318,6 +339,41 @@
<field name="cache" type="string" stored="true" indexed="false"/>
<field name="tstamp" type="date" stored="true" indexed="false"/>
+ <!-- fields for index-geoip plugin -->
+ <field name="ip" type="string" stored="true" indexed="true" />
+ <field name="cityName" type="string" stored="true" indexed="true" />
+ <field name="cityConfidence" type="int" stored="true" indexed="true" />
+ <field name="cityGeoNameId" type="int" stored="true" indexed="true" />
+ <field name="continentCode" type="string" stored="true" indexed="true" />
+ <field name="continentGeoNameId" type="int" stored="true" indexed="true" />
+ <field name="continentName" type="string" stored="true" indexed="true" />
+ <field name="countryIsoCode" type="string" stored="true" indexed="true"/>
+ <field name="countryName" type="string" stored="true" indexed="true" />
+ <field name="countryConfidence" type="int" stored="true" indexed="true"/>
+ <field name="countryGeoNameId" type="int" stored="true" indexed="true"/>
+ <field name="latLon" type="string" stored="true" indexed="true"/>
+ <field name="accRadius" type="int" stored="true" indexed="true"/>
+ <field name="timeZone" type="string" stored="true" indexed="true"/>
+ <field name="metroCode" type="int" stored="true" indexed="true" />
+ <field name="postalCode" type="string" stored="true" indexed="true" />
+ <field name="postalConfidence" type="int" stored="true" indexed="true" />
+ <field name="countryType" type="string" stored="true" indexed="true" />
+ <field name="subDivName" type="string" stored="true" indexed="true" />
+ <field name="subDivIsoCode" type="string" stored="true" indexed="true" />
+ <field name="subDivConfidence" type="int" stored="true" indexed="true" />
+ <field name="subDivGeoNameId" type="int" stored="true" indexed="true" />
+ <field name="autonSystemNum" type="int" stored="true" indexed="true" />
+ <field name="autonSystemOrg" type="string" stored="true" indexed="true" />
+ <field name="domain" type="string" stored="true" indexed="true" />
+ <field name="isp" type="string" stored="true" indexed="true" />
+ <field name="org" type="string" stored="true" indexed="true" />
+ <field name="userType" type="string" stored="true" indexed="true" />
+ <field name="isAnonProxy" type="boolean" stored="true" indexed="true" />
+ <field name="isSatelliteProv" type="boolean" stored="true" indexed="true"
/>
+ <field name="connType" type="string" stored="true" indexed="true" />
+
+ <dynamicField name="*_coordinate" type="tdouble" indexed="true"
stored="false"/>
+
<!-- catch-all field -->
<field name="text" type="text_general" stored="false" indexed="true"
multiValued="true"/>
@@ -363,5 +419,5 @@
<copyField source="title" dest="text"/>
<copyField source="anchor" dest="text"/>
<copyField source="author" dest="text"/>
-
+ <copyField source="latLon" dest="location"/>
</schema>
Modified: nutch/trunk/conf/schema.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/conf/schema.xml (original)
+++ nutch/trunk/conf/schema.xml Sat Jan 10 23:24:58 2015
@@ -38,6 +38,7 @@
omitNorms="true" positionIncrementGap="0"/>
<fieldType name="date" class="solr.TrieDateField" precisionStep="0"
omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="location" class="solr.LatLonType"
subFieldSuffix="_coordinate"/>
<fieldType name="text" class="solr.TextField"
positionIncrementGap="100">
@@ -64,6 +65,24 @@
generateWordParts="1" generateNumberParts="1"/>
</analyzer>
</fieldType>
+ <!-- boolean type: "true" or "false" -->
+ <fieldType name="boolean" class="solr.BoolField"
sortMissingLast="true"/>
+
+ <!-- The optional sortMissingLast and sortMissingFirst attributes are
+ currently supported on types that are sorted internally as strings
+ and on numeric types.
+ This includes "string","boolean", and, as of 3.5 (and 4.x),
+ int, float, long, date, double, including the "Trie" variants.
+ - If sortMissingLast="true", then a sort on this field will cause
documents
+ without the field to come after documents with the field,
+ regardless of the requested sort order (asc or desc).
+ - If sortMissingFirst="true", then a sort on this field will cause
documents
+ without the field to come before documents with the field,
+ regardless of the requested sort order.
+ - If sortMissingLast="false" and sortMissingFirst="false" (the default),
+ then default lucene sorting will be used which places docs without the
+ field first in an ascending sort and last in a descending sort.
+ -->
</types>
<fields>
<field name="id" type="string" stored="true" indexed="true"
@@ -83,6 +102,43 @@
<field name="title" type="text" stored="true" indexed="true"/>
<field name="cache" type="string" stored="true" indexed="false"/>
<field name="tstamp" type="date" stored="true" indexed="false"/>
+
+ <!-- fields for index-geoip plugin -->
+ <field name="ip" type="string" stored="true" indexed="true" />
+ <field name="cityName" type="string" stored="true" indexed="true" />
+ <field name="cityConfidence" type="int" stored="true" indexed="true" />
+ <field name="cityGeoNameId" type="int" stored="true" indexed="true" />
+ <field name="continentCode" type="string" stored="true" indexed="true"
/>
+ <field name="continentGeoNameId" type="int" stored="true"
indexed="true" />
+ <field name="continentName" type="string" stored="true" indexed="true" />
+ <field name="countryIsoCode" type="string" stored="true"
indexed="true"/>
+ <field name="countryName" type="string" stored="true" indexed="true" />
+ <field name="countryConfidence" type="int" stored="true"
indexed="true"/>
+ <field name="countryGeoNameId" type="int" stored="true"
indexed="true"/>
+ <field name="latLon" type="string" stored="true" indexed="true"/>
+ <field name="accRadius" type="int" stored="true" indexed="true"/>
+ <field name="timeZone" type="string" stored="true" indexed="true"/>
+ <field name="metroCode" type="int" stored="true" indexed="true" />
+ <field name="postalCode" type="string" stored="true" indexed="true" />
+ <field name="postalConfidence" type="int" stored="true" indexed="true"
/>
+ <field name="countryType" type="string" stored="true" indexed="true" />
+ <field name="subDivName" type="string" stored="true" indexed="true" />
+ <field name="subDivIsoCode" type="string" stored="true" indexed="true"
/>
+ <field name="subDivConfidence" type="int" stored="true" indexed="true"
/>
+ <field name="subDivGeoNameId" type="int" stored="true" indexed="true"
/>
+ <field name="autonSystemNum" type="int" stored="true" indexed="true" />
+ <field name="autonSystemOrg" type="string" stored="true"
indexed="true" />
+ <field name="domain" type="string" stored="true" indexed="true" />
+ <field name="isp" type="string" stored="true" indexed="true" />
+ <field name="org" type="string" stored="true" indexed="true" />
+ <field name="userType" type="string" stored="true" indexed="true" />
+ <field name="isAnonProxy" type="boolean" stored="true" indexed="true"
/>
+ <field name="isSatelliteProv" type="boolean" stored="true"
indexed="true" />
+ <field name="connType" type="string" stored="true" indexed="true" />
+
+
+
+ <dynamicField name="*_coordinate" type="tdouble" indexed="true"
stored="false"/>
<!-- fields for index-anchor plugin -->
<field name="anchor" type="string" stored="true" indexed="true"
@@ -137,5 +193,6 @@
<copyField source="title" dest="text"/>
<copyField source="anchor" dest="text"/>
<copyField source="author" dest="text"/>
+ <copyField source="latLon" dest="location"/>
</schema>
Modified: nutch/trunk/conf/solrindex-mapping.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/solrindex-mapping.xml?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/conf/solrindex-mapping.xml (original)
+++ nutch/trunk/conf/solrindex-mapping.xml Sat Jan 10 23:24:58 2015
@@ -17,8 +17,8 @@
-->
<mapping>
- <!-- Simple mapping of fields created by Nutch IndexingFilters
- to fields defined (and expected) in Solr schema.xml.
+ <!-- Simple mapping of fields created by Nutch IndexingFilters
+ to fields defined (and expected) in Solr schema.xml.
Any fields in NutchDocument that match a name defined
in field/@source will be renamed to the corresponding
@@ -30,14 +30,14 @@
uniqueKey has the same meaning as in Solr schema.xml
and defaults to "id" if not defined.
-->
- <fields>
- <field dest="content" source="content"/>
- <field dest="title" source="title"/>
- <field dest="host" source="host"/>
- <field dest="segment" source="segment"/>
- <field dest="boost" source="boost"/>
- <field dest="digest" source="digest"/>
- <field dest="tstamp" source="tstamp"/>
- </fields>
- <uniqueKey>id</uniqueKey>
+ <fields>
+ <field dest="content" source="content"/>
+ <field dest="title" source="title"/>
+ <field dest="host" source="host"/>
+ <field dest="segment" source="segment"/>
+ <field dest="boost" source="boost"/>
+ <field dest="digest" source="digest"/>
+ <field dest="tstamp" source="tstamp"/>
+ </fields>
+ <uniqueKey>id</uniqueKey>
</mapping>
Modified: nutch/trunk/default.properties
URL:
http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/default.properties (original)
+++ nutch/trunk/default.properties Sat Jan 10 23:24:58 2015
@@ -148,6 +148,7 @@ plugins.index=\
org.apache.nutch.indexer.anchor*:\
org.apache.nutch.indexer.basic*:\
org.apache.nutch.indexer.feed*:\
+ org.apache.nutch.indexer.geoip*:\
org.apache.nutch.indexer.metadata*:\
org.apache.nutch.indexer.more*:\
org.apache.nutch.indexer.static*:\
Modified:
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Sat Jan 10 23:24:58 2015
@@ -17,7 +17,6 @@
package org.apache.nutch.indexer;
-import java.util.Iterator;
import java.util.List;
import java.util.Map;
Modified: nutch/trunk/src/plugin/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Sat Jan 10 23:24:58 2015
@@ -31,6 +31,7 @@
<ant dir="headings" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
<ant dir="index-anchor" target="deploy"/>
+ <ant dir="index-geoip" target="deploy"/>
<ant dir="index-more" target="deploy"/>
<ant dir="index-static" target="deploy"/>
<ant dir="index-metadata" target="deploy"/>
@@ -83,6 +84,7 @@
<ant dir="creativecommons" target="test"/>
<ant dir="index-basic" target="test"/>
<ant dir="index-anchor" target="test"/>
+ <ant dir="index-geoip" target="test"/>
<ant dir="index-more" target="test"/>
<ant dir="index-static" target="test"/>
<ant dir="language-identifier" target="test"/>
@@ -122,6 +124,7 @@
<ant dir="headings" target="clean"/>
<ant dir="index-basic" target="clean"/>
<ant dir="index-anchor" target="clean"/>
+ <ant dir="index-geoip" target="clean"/>
<ant dir="index-more" target="clean"/>
<ant dir="index-static" target="clean"/>
<ant dir="index-metadata" target="clean"/>
Added: nutch/trunk/src/plugin/index-geoip/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/build.xml?rev=1650829&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/build.xml (added)
+++ nutch/trunk/src/plugin/index-geoip/build.xml Sat Jan 10 23:24:58 2015
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-geoip" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+ <target name="init-plugin">
+ <echo>Copying MaxMind GeoIP .mmdb files to build</echo>
+ <copy todir="${build.classes}">
+ <fileset dir="${src.dir}" includes="**/*.mmdb" />
+ </copy>
+ </target>
+</project>
Added: nutch/trunk/src/plugin/index-geoip/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/ivy.xml?rev=1650829&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/ivy.xml (added)
+++ nutch/trunk/src/plugin/index-geoip/ivy.xml Sat Jan 10 23:24:58 2015
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.1.0" />
+ </dependencies>
+
+</ivy-module>
Added: nutch/trunk/src/plugin/index-geoip/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/plugin.xml?rev=1650829&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/plugin.xml (added)
+++ nutch/trunk/src/plugin/index-geoip/plugin.xml Sat Jan 10 23:24:58 2015
@@ -0,0 +1,43 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="index-geoip"
+ name="GeoIP2 Indexing Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+
+ <runtime>
+ <library name="index-geoip.jar">
+ <export name="*"/>
+ </library>
+ <library name="geoip2-2.1.0.jar"/>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.indexer.geoip"
+ name="Nutch GeoIP2 Indexing Filter"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="GeoIPIndexingFilter"
+
class="org.apache.nutch.indexer.geoip.GeoIPIndexingFilter"/>
+ </extension>
+
+</plugin>
+
Added:
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java?rev=1650829&view=auto
==============================================================================
---
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
(added)
+++
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
Sat Jan 10 23:24:58 2015
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.geoip;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+import org.apache.nutch.indexer.NutchDocument;
+
+import com.maxmind.geoip2.DatabaseReader;
+import com.maxmind.geoip2.WebServiceClient;
+import com.maxmind.geoip2.exception.GeoIp2Exception;
+import com.maxmind.geoip2.model.InsightsResponse;
+import com.maxmind.geoip2.model.CityResponse;
+import com.maxmind.geoip2.model.ConnectionTypeResponse;
+import com.maxmind.geoip2.model.CountryResponse;
+import com.maxmind.geoip2.model.DomainResponse;
+import com.maxmind.geoip2.model.IspResponse;
+import com.maxmind.geoip2.record.City;
+import com.maxmind.geoip2.record.Continent;
+import com.maxmind.geoip2.record.Country;
+import com.maxmind.geoip2.record.Location;
+import com.maxmind.geoip2.record.Postal;
+import com.maxmind.geoip2.record.RepresentedCountry;
+import com.maxmind.geoip2.record.Subdivision;
+import com.maxmind.geoip2.record.Traits;
+
+/**
+ * <p>Simple utility class which enables efficient, structured
+ * {@link org.apache.nutch.indexer.NutchDocument} building based on input
+ * from {@link GeoIPIndexingFilter}, where configuration is also read.</p>
+ * <p>Based on the nature of the input, this class wraps factory type
+ * implementations for populating
+ * {@link org.apache.nutch.indexer.NutchDocument}s with the correct
+ * {@link org.apache.nutch.indexer.NutchField} information.</p>
+ *
+ */
+public class GeoIPDocumentCreator {
+
+ /**
+ * Default constructor.
+ */
+ public GeoIPDocumentCreator() {
+ }
+
+ public static NutchDocument createDocFromInsightsService(String serverIp,
+ NutchDocument doc, WebServiceClient client) throws UnknownHostException,
IOException, GeoIp2Exception {
+ doc.add("ip", serverIp);
+ InsightsResponse response =
client.insights(InetAddress.getByName(serverIp));
+ //CityResponse response = client.city(InetAddress.getByName(serverIp));
+
+ City city = response.getCity();
+ doc.add("cityName", city.getName()); // 'Minneapolis'
+ doc.add("cityConfidence", city.getConfidence()); // 50
+ doc.add("cityGeoNameId", city.getGeoNameId());
+
+ Continent continent = response.getContinent();
+ doc.add("continentCode", continent.getCode());
+ doc.add("continentGeoNameId", continent.getGeoNameId());
+ doc.add("continentName", continent.getName());
+
+ Country country = response.getCountry();
+ doc.add("countryIsoCode", country.getIsoCode()); // 'US'
+ doc.add("countryName", country.getName()); // 'United States'
+ doc.add("countryConfidence", country.getConfidence()); // 99
+ doc.add("countryGeoNameId", country.getGeoNameId());
+
+ Location location = response.getLocation();
+ doc.add("latLon", location.getLatitude() + "," + location.getLongitude());
// 44.9733, -93.2323
+ doc.add("accRadius", location.getAccuracyRadius()); // 3
+ doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
+ doc.add("metroCode", location.getMetroCode());
+
+ Postal postal = response.getPostal();
+ doc.add("postalCode", postal.getCode()); // '55455'
+ doc.add("postalConfidence", postal.getConfidence()); // 40
+
+ RepresentedCountry rCountry = response.getRepresentedCountry();
+ doc.add("countryType", rCountry.getType());
+
+ Subdivision subdivision = response.getMostSpecificSubdivision();
+ doc.add("subDivName", subdivision.getName()); // 'Minnesota'
+ doc.add("subDivIsoCode", subdivision.getIsoCode()); // 'MN'
+ doc.add("subDivConfidence", subdivision.getConfidence()); // 90
+ doc.add("subDivGeoNameId", subdivision.getGeoNameId());
+
+ Traits traits = response.getTraits();
+ doc.add("autonSystemNum", traits.getAutonomousSystemNumber());
+ doc.add("autonSystemOrg", traits.getAutonomousSystemOrganization());
+ doc.add("domain", traits.getDomain());
+ doc.add("isp", traits.getIsp());
+ doc.add("org", traits.getOrganization());
+ doc.add("userType", traits.getUserType());
+ doc.add("isAnonProxy", traits.isAnonymousProxy());
+ doc.add("isSatelliteProv", traits.isSatelliteProvider());
+ return doc;
+ }
+
+ @SuppressWarnings("unused")
+ public static NutchDocument createDocFromCityService(String serverIp,
+ NutchDocument doc, WebServiceClient client) throws UnknownHostException,
IOException, GeoIp2Exception {
+ CityResponse response = client.city(InetAddress.getByName(serverIp));
+ return doc;
+ }
+
+ @SuppressWarnings("unused")
+ public static NutchDocument createDocFromCountryService(String serverIp,
+ NutchDocument doc, WebServiceClient client) throws UnknownHostException,
IOException, GeoIp2Exception {
+ CountryResponse response =
client.country(InetAddress.getByName(serverIp));
+ return doc;
+ }
+
+ public static NutchDocument createDocFromIspDb(String serverIp,
NutchDocument doc,
+ DatabaseReader reader) throws UnknownHostException, IOException,
GeoIp2Exception {
+ IspResponse response = reader.isp(InetAddress.getByName(serverIp));
+ doc.add("ip", serverIp);
+ doc.add("autonSystemNum", response.getAutonomousSystemNumber());
+ doc.add("autonSystemOrg", response.getAutonomousSystemOrganization());
+ doc.add("isp", response.getIsp());
+ doc.add("org", response.getOrganization());
+ return doc;
+ }
+
+ public static NutchDocument createDocFromDomainDb(String serverIp,
NutchDocument doc,
+ DatabaseReader reader) throws UnknownHostException, IOException,
GeoIp2Exception {
+ DomainResponse response = reader.domain(InetAddress.getByName(serverIp));
+ doc.add("ip", serverIp);
+ doc.add("domain", response.getDomain());
+ return doc;
+ }
+
+ public static NutchDocument createDocFromConnectionDb(String serverIp,
+ NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
IOException, GeoIp2Exception {
+ ConnectionTypeResponse response =
reader.connectionType(InetAddress.getByName(serverIp));
+ doc.add("ip", serverIp);
+ doc.add("connType", response.getConnectionType().toString());
+ return doc;
+ }
+
+ public static NutchDocument createDocFromCityDb(String serverIp,
NutchDocument doc,
+ DatabaseReader reader) throws UnknownHostException, IOException,
GeoIp2Exception {
+ doc.add("ip", serverIp);
+ CityResponse response = reader.city(InetAddress.getByName(serverIp));
+
+ City city = response.getCity();
+ doc.add("cityName", city.getName()); // 'Minneapolis'
+ doc.add("cityConfidence", city.getConfidence()); // 50
+ doc.add("cityGeoNameId", city.getGeoNameId());
+
+ Continent continent = response.getContinent();
+ doc.add("continentCode", continent.getCode());
+ doc.add("continentGeoNameId", continent.getGeoNameId());
+ doc.add("continentName", continent.getName());
+
+ Country country = response.getCountry();
+ doc.add("countryIsoCode", country.getIsoCode()); // 'US'
+ doc.add("countryName", country.getName()); // 'United States'
+ doc.add("countryConfidence", country.getConfidence()); // 99
+ doc.add("countryGeoNameId", country.getGeoNameId());
+
+ Location location = response.getLocation();
+ doc.add("latLon", location.getLatitude() + "," + location.getLongitude());
// 44.9733, -93.2323
+ doc.add("accRadius", location.getAccuracyRadius()); // 3
+ doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
+ doc.add("metroCode", location.getMetroCode());
+
+ Postal postal = response.getPostal();
+ doc.add("postalCode", postal.getCode()); // '55455'
+ doc.add("postalConfidence", postal.getConfidence()); // 40
+
+ RepresentedCountry rCountry = response.getRepresentedCountry();
+ doc.add("countryType", rCountry.getType());
+
+ Subdivision subdivision = response.getMostSpecificSubdivision();
+ doc.add("subDivName", subdivision.getName()); // 'Minnesota'
+ doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+ doc.add("subDivConfidence", subdivision.getConfidence()); // 90
+ doc.add("subDivGeoNameId", subdivision.getGeoNameId());
+ return doc;
+ }
+
+}
Added:
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java?rev=1650829&view=auto
==============================================================================
---
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
(added)
+++
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
Sat Jan 10 23:24:58 2015
@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.geoip;
+
+import java.io.File;
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.maxmind.geoip2.DatabaseReader;
+import com.maxmind.geoip2.WebServiceClient;
+
+/**
+ * <p>This plugin implements an indexing filter which takes
+ * advantage of the
+ * <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</p>
+ * <p>The third party library distribution provides an API for the GeoIP2
+ * <a href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web
services</a>
+ * and <a
href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>.
+ * The API also works with the free
+ * <a href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2
databases</a>.</p>
+ * <p>Depending on the service level agreement you have with the GeoIP
service provider,
+ * the plugin can add a number of the following fields to the index data model:
+ * <ol>
+ * <li>Continent</li>
+ * <li>Country</li>
+ * <li>Regional Subdivision</li>
+ * <li>City</li>
+ * <li>Postal Code</li>
+ * <li>Latitude/Longitude</li>
+ * <li>ISP/Organization</li>
+ * <li>AS Number</li>
+ * <li>Confidence Factors</li>
+ * <li>Radius</li>
+ * <li>User Type</li>
+ * </ol></p>
+ *
+ * <p>Some of the services are documented at the
+ * <a href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2
Precision Services</a>
+ * webpage where more information can be obtained.</p>
+ *
+ * <p>You should also consult the following three properties in
<code>nutch-site.xml</code></p>
+ * <pre>
+ * {@code
+ *<!-- index-geoip plugin properties -->
+<property>
+ <name>index.geoip.usage</name>
+ <value>insightsService</value>
+ <description>
+ A string representing the information source to be used for GeoIP information
+ association. Either enter 'cityDatabase', 'connectionTypeDatabase',
+ 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any
one of the
+ Database options, you should make one of GeoIP2-City.mmdb,
GeoIP2-Connection-Type.mmdb,
+ GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the
Hadoop classpath
+ and available at runtime. This can be achieved by adding it to
$NUTCH_HOME/conf
+ </description>
+</property>
+
+<property>
+ <name>index.geoip.userid</name>
+ <value></value>
+ <description>
+ The userId associated with the GeoIP2 Precision Services account.
+ </description>
+</property>
+
+<property>
+ <name>index.geoip.licensekey</name>
+ <value></value>
+ <description>
+ The license key associated with the GeoIP2 Precision Services account.
+ </description>
+</property>
+}
+ * </pre>
+ *
+ */
+public class GeoIPIndexingFilter implements IndexingFilter {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(GeoIPIndexingFilter.class);
+
+ private Configuration conf;
+
+ private String usage = null;
+
+ private File geoDb = null;
+
+ WebServiceClient client = null;
+
+ DatabaseReader reader = null;
+
+ //private AbstractResponse response = null;
+
+ /**
+ * Default constructor for this plugin
+ */
+ public GeoIPIndexingFilter() {
+ }
+
+ /**
+ * @see org.apache.hadoop.conf.Configurable#getConf()
+ */
+ @Override
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ /**
+ * @see
org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
+ */
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ String use = conf.get("index.geoip.usage", "insightsService");
+ LOG.debug("GeoIP usage medium set to: {}", use);
+ if (use.equalsIgnoreCase("cityDatabase")) {
+ try {
+ geoDb = new File(conf.getResource("GeoIP2-City.mmdb").getFile());
+ buildDb();
+ } catch (Exception e) {
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ }
+ } else if (use.equalsIgnoreCase("connectionTypeDatabase")) {
+ try {
+ geoDb = new
File(conf.getResource("GeoIP2-Connection-Type.mmdb").getFile());
+ buildDb();
+ } catch (Exception e) {
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ }
+ } else if (use.equalsIgnoreCase("domainDatabase")) {
+ try {
+ geoDb = new File(conf.getResource("GeoIP2-Domain.mmdb").getFile());
+ buildDb();
+ } catch (Exception e) {
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ }
+ } else if (use.equalsIgnoreCase("ispDatabase")) {
+ try {
+ geoDb = new File(conf.getResource("GeoIP2-ISP.mmdb").getFile());
+ buildDb();
+ } catch (Exception e) {
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ }
+ } else if (use.equalsIgnoreCase("insightsService")) {
+ client = new WebServiceClient.Builder(
+ conf.getInt("index.geoip.userid", 12345),
conf.get("index.geoip.licensekey")).build();
+ }
+ usage = use;
+ }
+
+ private void buildDb() {
+ try {
+ reader = new DatabaseReader.Builder(geoDb).build();
+ } catch (IOException e) {
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ }
+ }
+
+ /**
+ *
+ * @see
org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument,
org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text,
org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks)
+ */
+ @Override
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+ return addServerGeo(doc, parse.getData(), url.toString());
+ }
+
+ private NutchDocument addServerGeo(NutchDocument doc, ParseData data, String
url) {
+
+ if (conf.getBoolean("store.ip.address", false) == true) {
+ try {
+ String serverIp = data.getContentMeta().get("_ip_");
+ if (serverIp != null) {
+ if (usage.equalsIgnoreCase("cityDatabase")) {
+ doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc,
reader);
+ } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
+ doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp,
doc, reader);
+ } else if (usage.equalsIgnoreCase("domainDatabase")) {
+ doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc,
reader);
+ } else if (usage.equalsIgnoreCase("ispDatabase")) {
+ doc = GeoIPDocumentCreator.createDocFromIspDb(serverIp, doc,
reader);
+ } else if (usage.equalsIgnoreCase("insightsService")) {
+ doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp,
doc, client);
+ }
+ }
+ } catch (Exception e) {
+ LOG.error(e.getMessage());
+ e.printStackTrace();
+ }
+ }
+ return doc;
+ }
+
+}
Added:
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java?rev=1650829&view=auto
==============================================================================
---
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
(added)
+++
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
Sat Jan 10 23:24:58 2015
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * <p>This plugin implements an indexing filter which takes
+ * advantage of the
+ * <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</p>
+ * <p>The third party library distribution provides an API for the GeoIP2
+ * <a href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web
services</a>
+ * and <a
href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>.
+ * The API also works with the free
+ * <a href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2
databases</a>.</p>
+ *
+ */
+package org.apache.nutch.indexer.geoip;
\ No newline at end of file