This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 8fc4f17ac NUTCH-2956 index-geoip: dependency upgrades and improvements - upgrade to geoip2 3.0.1 - exclude transitive dependencies (Jackson) provided as Nutch core deps - read also GeoLite2-*.mmdb files - review index field names in plugin and Nutch Solr schema: - fix typos in field names - remove unused fields from schema 8fc4f17ac is described below commit 8fc4f17acc5da28c22ef4e77c2316e20e5976f02 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Sat Aug 6 15:04:10 2022 +0200 NUTCH-2956 index-geoip: dependency upgrades and improvements - upgrade to geoip2 3.0.1 - exclude transitive dependencies (Jackson) provided as Nutch core deps - read also GeoLite2-*.mmdb files - review index field names in plugin and Nutch Solr schema: - fix typos in field names - remove unused fields from schema --- conf/nutch-default.xml | 3 +- src/plugin/index-geoip/ivy.xml | 11 +++-- src/plugin/index-geoip/plugin.xml | 7 +--- .../nutch/indexer/geoip/GeoIPDocumentCreator.java | 49 ++++++++++++---------- .../nutch/indexer/geoip/GeoIPIndexingFilter.java | 34 ++++++++------- src/plugin/indexer-solr/schema.xml | 3 +- 6 files changed, 57 insertions(+), 50 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 7faa6fdcd..bb9aae1b3 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -2112,7 +2112,8 @@ Add scoring-metadata to the list of active plugins 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the classpath and - available at runtime. + available at runtime. Alternatively, also the GeoLite2 IP databases (GeoLite2-*.mmdb) + can be used. </description> </property> diff --git a/src/plugin/index-geoip/ivy.xml b/src/plugin/index-geoip/ivy.xml index 4fa6f71a7..2eda5a63f 100644 --- a/src/plugin/index-geoip/ivy.xml +++ b/src/plugin/index-geoip/ivy.xml @@ -36,12 +36,11 @@ </publications> <dependencies> - <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.12.0" > - <!-- Exlude due to classpath issues --> - <exclude org="org.apache.httpcomponents" name="httpclient" /> - <exclude org="org.apache.httpcomponents" name="httpcore" /> - <exclude org="commons-codec" name="commons-codec" /> - <exclude org="commons-logging" name="commons-logging" /> + <dependency org="com.maxmind.geoip2" name="geoip2" rev="3.0.1"> + <!-- Exlude libs provided in Nutch core --> + <exclude org="com.fasterxml.jackson.core" name="jackson-annotations" /> + <exclude org="com.fasterxml.jackson.core" name="jackson-databind" /> + <exclude org="com.fasterxml.jackson.core" name="jackson-core" /> </dependency> </dependencies> diff --git a/src/plugin/index-geoip/plugin.xml b/src/plugin/index-geoip/plugin.xml index 6148f59e5..c4efadf94 100644 --- a/src/plugin/index-geoip/plugin.xml +++ b/src/plugin/index-geoip/plugin.xml @@ -25,11 +25,8 @@ <library name="index-geoip.jar"> <export name="*"/> </library> - <library name="geoip2-2.12.0.jar"/> - <library name="jackson-annotations-2.9.5.jar"/> - <library name="jackson-core-2.9.5.jar"/> - <library name="jackson-databind-2.9.5.jar"/> - <library name="maxmind-db-1.2.2.jar"/> + <library name="geoip2-3.0.1.jar"/> + <library name="maxmind-db-2.0.0.jar"/> </runtime> <requires> diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java index 1c697a205..64b3862be 100644 --- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java +++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java @@ -17,13 +17,17 @@ package org.apache.nutch.indexer.geoip; import java.io.IOException; +import java.lang.invoke.MethodHandles; import java.net.InetAddress; import java.net.UnknownHostException; import org.apache.nutch.indexer.NutchDocument; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.maxmind.geoip2.DatabaseReader; import com.maxmind.geoip2.WebServiceClient; +import com.maxmind.geoip2.exception.AddressNotFoundException; import com.maxmind.geoip2.exception.GeoIp2Exception; import com.maxmind.geoip2.model.InsightsResponse; import com.maxmind.geoip2.model.CityResponse; @@ -54,28 +58,17 @@ import com.maxmind.geoip2.record.Traits; */ public class GeoIPDocumentCreator { - /** - * Add field to document but only if value isn't null - * @param doc the {@link NutchDocument} to augment - * @param name the name of the target field - * @param value the String value to associate with the target field - */ - public static void addIfNotNull(NutchDocument doc, String name, - String value) { - if (value != null) { - doc.add(name, value); - } - } + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); /** * Add field to document but only if value isn't null * @param doc the {@link NutchDocument} to augment * @param name the name of the target field - * @param value the {@link java.lang.Integer} value to - * associate with the target field + * @param value the String value to associate with the target field */ public static void addIfNotNull(NutchDocument doc, String name, - Integer value) { + Object value) { if (value != null) { doc.add(name, value); } @@ -87,7 +80,6 @@ public class GeoIPDocumentCreator { addIfNotNull(doc, "ip", serverIp); InsightsResponse response = client .insights(InetAddress.getByName(serverIp)); - // CityResponse response = client.city(InetAddress.getByName(serverIp)); City city = response.getCity(); addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis' @@ -103,7 +95,7 @@ public class GeoIPDocumentCreator { addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US' addIfNotNull(doc, "countryName", country.getName()); // 'United States' addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99 - addIfNotNull(doc, "countryGeoName", country.getGeoNameId()); + addIfNotNull(doc, "countryGeoNameId", country.getGeoNameId()); Location location = response.getLocation(); addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, @@ -121,7 +113,7 @@ public class GeoIPDocumentCreator { Subdivision subdivision = response.getMostSpecificSubdivision(); addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota' - addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN' + addIfNotNull(doc, "subDivIsoCode", subdivision.getIsoCode()); // 'MN' addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90 addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId()); @@ -169,7 +161,13 @@ public class GeoIPDocumentCreator { public static NutchDocument createDocFromDomainDb(String serverIp, NutchDocument doc, DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception { - DomainResponse response = reader.domain(InetAddress.getByName(serverIp)); + DomainResponse response; + try { + response = reader.domain(InetAddress.getByName(serverIp)); + } catch (AddressNotFoundException e) { + LOG.debug("IP address not found: {}", serverIp); + return doc; + } addIfNotNull(doc, "ip", serverIp); addIfNotNull(doc, "domain", response.getDomain()); return doc; @@ -189,7 +187,14 @@ public class GeoIPDocumentCreator { NutchDocument doc, DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception { addIfNotNull(doc, "ip", serverIp); - CityResponse response = reader.city(InetAddress.getByName(serverIp)); + + CityResponse response; + try { + response = reader.city(InetAddress.getByName(serverIp)); + } catch (AddressNotFoundException e) { + LOG.debug("IP address not found: {}", serverIp); + return doc; + } City city = response.getCity(); addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis' @@ -206,7 +211,7 @@ public class GeoIPDocumentCreator { addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US' addIfNotNull(doc, "countryName", country.getName()); // 'United States' addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99 - addIfNotNull(doc, "countryGeoName", country.getGeoNameId()); + addIfNotNull(doc, "countryGeoNameId", country.getGeoNameId()); Location location = response.getLocation(); addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, @@ -224,7 +229,7 @@ public class GeoIPDocumentCreator { Subdivision subdivision = response.getMostSpecificSubdivision(); addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota' - addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN' + addIfNotNull(doc, "subDivIsoCode", subdivision.getIsoCode()); // 'MN' addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90 addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId()); return doc; diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java index 4e2127365..ea30b8c7b 100644 --- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java +++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java @@ -87,7 +87,8 @@ import com.maxmind.geoip2.WebServiceClient; * 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the * Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, * GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath - * and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf + * and available at runtime. This can be achieved by adding it to `$NUTCH_HOME/conf`. + * Alternatively, also the GeoLite2 IP databases (GeoLite2-*.mmdb) can be used. * </description> * </property> * @@ -152,24 +153,29 @@ public class GeoIPIndexingFilter implements IndexingFilter { conf.getInt("index.geoip.userid", 12345), conf.get("index.geoip.licensekey")).build(); } else { - String db = null; + String dbSuffix = null; if (usage.equalsIgnoreCase("cityDatabase")) { - db = "GeoIP2-City.mmdb"; + dbSuffix = "-City.mmdb"; } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) { - db = "GeoIP2-Connection-Type.mmdb"; + dbSuffix = "-Connection-Type.mmdb"; } else if (usage.equalsIgnoreCase("domainDatabase")) { - db = "GeoIP2-Domain.mmdb"; + dbSuffix = "-Domain.mmdb"; } else if (usage.equalsIgnoreCase("ispDatabase")) { - db = "GeoIP2-ISP.mmdb"; + dbSuffix = "-ISP.mmdb"; } - URL dbFileUrl = conf.getResource(db); - if (dbFileUrl == null) { - LOG.error("GeoDb file {} not found on classpath", db); - } else { - try { - buildDb(new File(dbFileUrl.getFile())); - } catch (Exception e) { - LOG.error("Failed to read geoDb file {}: ", db, e); + String[] dbPrefixes = {"GeoIP2", "GeoLite2"}; + for (String dbPrefix : dbPrefixes) { + String db = dbPrefix + dbSuffix; + URL dbFileUrl = conf.getResource(db); + if (dbFileUrl == null) { + LOG.error("GeoDb file {} not found on classpath", db); + } else { + try { + LOG.info("Reading GeoDb file {}", db); + buildDb(new File(dbFileUrl.getFile())); + } catch (Exception e) { + LOG.error("Failed to read geoDb file {}: ", db, e); + } } } } diff --git a/src/plugin/indexer-solr/schema.xml b/src/plugin/indexer-solr/schema.xml index 6865eb02c..ba71fe148 100644 --- a/src/plugin/indexer-solr/schema.xml +++ b/src/plugin/indexer-solr/schema.xml @@ -356,7 +356,7 @@ <field name="cityGeoNameId" type="int" stored="true" indexed="true" /> <field name="continentCode" type="string" stored="true" indexed="true" /> <field name="continentGeoNameId" type="int" stored="true" indexed="true" /> - <field name="contentName" type="string" stored="true" indexed="true" /> + <field name="continentName" type="string" stored="true" indexed="true" /> <field name="countryIsoCode" type="string" stored="true" indexed="true"/> <field name="countryName" type="string" stored="true" indexed="true" /> <field name="countryConfidence" type="int" stored="true" indexed="true"/> @@ -379,7 +379,6 @@ <field name="org" type="string" stored="true" indexed="true" /> <field name="userType" type="string" stored="true" indexed="true" /> <field name="isAnonProxy" type="boolean" stored="true" indexed="true" /> - <field name="isSatelitteProv" type="boolean" stored="true" indexed="true" /> <field name="connType" type="string" stored="true" indexed="true" /> <field name="location" type="location" stored="true" indexed="true" />