Joal has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/403916 )
Change subject: Refactor geo-coding function and add ISP
......................................................................
Refactor geo-coding function and add ISP
Geocoding functions using the Maxmind-Country and Maxmind-City
databases were bundled into a single class. Now there are
two classes, one per database, which avoids loading
unneeded data. Also, the code no longer tries to look up data
for internal IPs.
The Maxmind-ISP database functions have also been added. They
mimic the behavior of Maxmind-City ones.
Finally, the list of internal network IPs has been refreshed in
refinery-core IpUtil.
Bug: T167907
Change-Id: I602b04847d6083d8ba4a4de3c6614d8952d83608
---
M refinery-core/pom.xml
D refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java
A
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/GeocodeCity.java
A
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/GeocodeCountry.java
A
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/GeocodeISP.java
M refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/IpUtil.java
R
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocodeCity.java
A
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocodeCountry.java
A
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocodeISP.java
A refinery-core/src/test/resources/GeoIP2-ISP-Test.mmdb
M refinery-hive/pom.xml
M
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java
M
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetCountryISOCodeUDF.java
M
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetCountryNameUDF.java
M
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetGeoDataUDF.java
A
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetISPDataUDF.java
A
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetISPDataUDF.java
D refinery-hive/src/test/resources/GeoIP2-City-Test.mmdb
A refinery-hive/src/test/resources/GeoIP2-City-Test.mmdb
D refinery-hive/src/test/resources/GeoIP2-Country-Test.mmdb
A refinery-hive/src/test/resources/GeoIP2-Country-Test.mmdb
A refinery-hive/src/test/resources/GeoIP2-ISP-Test.mmdb
22 files changed, 971 insertions(+), 336 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source
refs/changes/16/403916/1
diff --git a/refinery-core/pom.xml b/refinery-core/pom.xml
index 8ace48a..eaa4fbc 100644
--- a/refinery-core/pom.xml
+++ b/refinery-core/pom.xml
@@ -132,6 +132,7 @@
<systemPropertyVariables>
<maxmind.database.country>${project.build.testOutputDirectory}/GeoIP2-Country-Test.mmdb</maxmind.database.country>
<maxmind.database.city>${project.build.testOutputDirectory}/GeoIP2-City-Test.mmdb</maxmind.database.city>
+
<maxmind.database.isp>${project.build.testOutputDirectory}/GeoIP2-ISP-Test.mmdb</maxmind.database.isp>
</systemPropertyVariables>
<forkCount>1</forkCount>
<includes>
diff --git
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java
deleted file mode 100644
index af1339a..0000000
---
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Geocode.java
+++ /dev/null
@@ -1,259 +0,0 @@
-/**
- * Copyright (C) 2014 Wikimedia Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.wikimedia.analytics.refinery.core;
-
-import com.maxmind.geoip2.DatabaseReader;
-import com.maxmind.geoip2.exception.GeoIp2Exception;
-import com.maxmind.geoip2.model.CityResponse;
-import com.maxmind.geoip2.model.CountryResponse;
-import com.maxmind.geoip2.record.*;
-import org.apache.log4j.Logger;
-
-import java.io.File;
-import java.io.IOException;
-import java.net.InetAddress;
-import java.net.UnknownHostException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-
-/**
- * Contains functions to find geo information of an IP address using Maxmind's
GeoIP2
- *
- * TODO: Allow usage of this class without always instantiating both city and
country databases.
- */
-public class Geocode {
- // Default paths to Maxmind databases
- public static final String DEFAULT_DATABASE_COUNTRY_PATH =
"/usr/share/GeoIP/GeoIP2-Country.mmdb";
- public static final String DEFAULT_DATABASE_CITY_PATH =
"/usr/share/GeoIP/GeoIP2-City.mmdb";
-
- static final Logger LOG = Logger.getLogger(Geocode.class.getName());
-
- //Constants to hold the keys to use in geo-coded data map
- private static final String CONTINENT = "continent";
- private static final String COUNTRY_CODE = "country_code";
- private static final String COUNTRY = "country";
- private static final String SUBDIVISION = "subdivision";
- private static final String CITY = "city";
- private static final String POSTAL_CODE = "postal_code";
- private static final String LATITUDE = "latitude";
- private static final String LONGITUDE = "longitude";
- private static final String TIMEZONE = "timezone";
-
- private static final String UNKNOWN_COUNTRY_CODE = "--";
- private static final String UNKNOWN_VALUE = "Unknown";
-
- private DatabaseReader countryDatabaseReader;
- private DatabaseReader cityDatabaseReader;
-
-
- /**
- * Constructs a Geocode object with the default Maxmind 2 database paths.
- * You can override either of the default database paths by setting
- * the 'maxmind.database.country' and/or 'maxmind.database.city'
properties.
- */
- public Geocode() throws IOException {
- this(null, null);
- }
-
- /**
- * Constructs a Geocode object with the provided Maxmind 2 database paths.
- * These are 'optional', in that you may set either one to null. If null,
- * the system properties 'maxmind.database.country' and
'maxmind.database.city'
- * will be checked for paths. If these are not set, then this will
default to
- * DEFAULT_DATABASE_PATH_COUNTRY and DEFAULT_DATABASE_PATH_CITY
respectively.
- *
- * @param countryDatabasePath
- * String path to Maxmind's country database
- * @param cityDatabasePath
- * String path to Maxmind's city database
- */
- public Geocode(String countryDatabasePath, String cityDatabasePath) throws
IOException {
- // Override database paths with System properties, if they exist
- if (countryDatabasePath == null) {
- countryDatabasePath =
System.getProperty("maxmind.database.country", DEFAULT_DATABASE_COUNTRY_PATH);
- }
- if (cityDatabasePath == null) {
- cityDatabasePath = System.getProperty("maxmind.database.city",
DEFAULT_DATABASE_CITY_PATH);
- }
-
- LOG.info("Geocode using Maxmind country database: " +
countryDatabasePath);
- LOG.info("Geocode using Maxmind city database: " +
cityDatabasePath);
-
- countryDatabaseReader = new DatabaseReader.Builder(new
File(countryDatabasePath)).build();
- cityDatabaseReader = new DatabaseReader.Builder(new
File(cityDatabasePath)).build();
- }
-
- /**
- * Gets the country code for the given IP
- * @param ip
- * String IP address
- * @return
- * String
- */
- public final String getCountryCode(final String ip) {
- try {
- InetAddress ipAddress = InetAddress.getByName(ip);
- CountryResponse response =
countryDatabaseReader.country(ipAddress);
- Country country = response.getCountry();
- String ret = country.getIsoCode();
- if (ret == null) {
- ret = UNKNOWN_COUNTRY_CODE;
- }
- return ret;
- } catch (UnknownHostException hEx) {
- LOG.warn(hEx);
- return UNKNOWN_COUNTRY_CODE;
- } catch (IOException iEx) {
- LOG.warn(iEx);
- return UNKNOWN_COUNTRY_CODE;
- } catch (GeoIp2Exception gEx) {
- LOG.warn(gEx);
- return UNKNOWN_COUNTRY_CODE;
- }
- }
-
- /**
- * Gets a map with geo-code fields for the given IP
- * @param ip
- * String Ip address
- * @return
- * Map
- */
- public final Map<String, Object> getGeocodedData(final String ip) {
-
- InetAddress ipAddress = null;
- //Initialize map with default values
- Map<String, Object> geoData = getDefaultMap();
-
- try {
- ipAddress = InetAddress.getByName(ip);
- } catch (UnknownHostException hEx) {
- LOG.warn(hEx);
- return geoData;
- }
-
- CityResponse response = null;
- try {
- response = cityDatabaseReader.city(ipAddress);
- } catch (IOException iEx) {
- LOG.warn(iEx);
- return geoData;
- } catch (GeoIp2Exception gEx) {
- LOG.warn(gEx);
- return geoData;
- }
-
- if (response == null)
- return geoData;
-
- Continent continent = response.getContinent();
- if (continent != null) {
- String name = continent.getName();
- if (name != null) {
- geoData.put(CONTINENT, name);
- }
- }
-
- Country country = response.getCountry();
- if (country != null) {
- String name = country.getName();
- String isoCode = country.getIsoCode();
- if (name != null && isoCode != null) {
- geoData.put(COUNTRY, name);
- geoData.put(COUNTRY_CODE, isoCode);
- }
- }
-
- List<Subdivision> subdivisions = response.getSubdivisions();
- if (subdivisions != null && subdivisions.size() > 0) {
- Subdivision subdivision = subdivisions.get(0);
- if (subdivision != null) {
- String name = subdivision.getName();
- if (name != null) {
- geoData.put(SUBDIVISION, name);
- }
- }
- }
-
- City city = response.getCity();
- if (city != null) {
- String name = city.getName();
- if (name != null) {
- geoData.put(CITY, name);
- }
- }
-
- Postal postal = response.getPostal();
- if (postal != null) {
- String code = postal.getCode();
- if (code != null) {
- geoData.put(POSTAL_CODE, code);
- }
- }
-
- Location location = response.getLocation();
- if (location != null) {
- Double lat = location.getLatitude();
- Double lon = location.getLongitude();
- if (lat != null && lon != null) {
- geoData.put(LATITUDE, lat);
- geoData.put(LONGITUDE, lon);
- }
- if (location.getTimeZone() != null)
- geoData.put(TIMEZONE, location.getTimeZone());
- }
-
- return geoData;
- }
-
- /**
- * Creates a new geo data map with default values for all fields
- * @return Map
- */
- private Map<String, Object> getDefaultMap() {
- Map<String, Object> defaultGeoData = new HashMap<String, Object>();
- defaultGeoData.put(CONTINENT, UNKNOWN_VALUE);
- defaultGeoData.put(COUNTRY_CODE, UNKNOWN_COUNTRY_CODE);
- defaultGeoData.put(COUNTRY, UNKNOWN_VALUE);
- defaultGeoData.put(SUBDIVISION, UNKNOWN_VALUE);
- defaultGeoData.put(CITY, UNKNOWN_VALUE);
- defaultGeoData.put(POSTAL_CODE, UNKNOWN_VALUE);
- defaultGeoData.put(LATITUDE, -1);
- defaultGeoData.put(LONGITUDE, -1);
- defaultGeoData.put(TIMEZONE, UNKNOWN_VALUE);
-
- return defaultGeoData;
- }
-
- /**
- * Translate a country code into the country name
- *
- * @param countryCode
- *
- * @return String country name
- */
- public static String getCountryName(String countryCode) {
- if (countryCode == null){
- countryCode = "";
- }
- Locale l = new Locale("", countryCode);
- String displayCountry = l.getDisplayCountry();
- return displayCountry.equalsIgnoreCase(countryCode) ?
UNKNOWN_VALUE : displayCountry;
- }
-}
diff --git
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/GeocodeCity.java
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/GeocodeCity.java
new file mode 100644
index 0000000..7f33ce9
--- /dev/null
+++
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/GeocodeCity.java
@@ -0,0 +1,211 @@
+/**
+ * Copyright (C) 2014 Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wikimedia.analytics.refinery.core;
+
+import com.maxmind.geoip2.DatabaseReader;
+import com.maxmind.geoip2.exception.GeoIp2Exception;
+import com.maxmind.geoip2.model.CityResponse;
+import com.maxmind.geoip2.record.*;
+import org.apache.log4j.Logger;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Contains a function finding geo information of an IP address using
Maxmind's GeoIP2-City
+ */
+public class GeocodeCity {
+ // Default paths to Maxmind City database
+ public static final String DEFAULT_DATABASE_CITY_PATH =
"/usr/share/GeoIP/GeoIP2-City.mmdb";
+
+ static final Logger LOG = Logger.getLogger(GeocodeCity.class.getName());
+
+ // Constants to hold the keys to use in data map
+ private static final String CONTINENT = "continent";
+ private static final String COUNTRY_CODE = "country_code";
+ private static final String COUNTRY = "country";
+ private static final String SUBDIVISION = "subdivision";
+ private static final String CITY = "city";
+ private static final String POSTAL_CODE = "postal_code";
+ private static final String LATITUDE = "latitude";
+ private static final String LONGITUDE = "longitude";
+ private static final String TIMEZONE = "timezone";
+
+ private static final String UNKNOWN_COUNTRY_CODE = "--";
+ private static final String UNKNOWN_VALUE = "Unknown";
+ private static final int UNKNOWN_LAT_LONG = -1;
+
+ private DatabaseReader cityDatabaseReader;
+
+ private final IpUtil ipUtil = new IpUtil();
+
+
+ /**
+ * Constructs a GeocodeCity object with the default Maxmind 2 City
database path.
+ * You can override the default database path by setting the
'maxmind.database.city' property.
+ */
+ public GeocodeCity() throws IOException {
+ this(null);
+ }
+
+ /**
+ * Constructs a GeocodeCity object with the provided Maxmind 2 City
database path.
+ * This path is 'optional', in that you may set it to null. If null, the
system
+ * properties 'maxmind.database.city' will be checked for paths.
+ * If it is not set, then this will default to DEFAULT_DATABASE_CITY_PATH.
+ *
+ * @param cityDatabasePath String path to Maxmind's city database
+ */
+ public GeocodeCity(String cityDatabasePath) throws IOException {
+ // Override database paths with System properties, if they exist
+ if (cityDatabasePath == null) {
+ cityDatabasePath = System.getProperty("maxmind.database.city",
DEFAULT_DATABASE_CITY_PATH);
+ }
+
+ LOG.info("Geocode using Maxmind city database: " +
cityDatabasePath);
+
+ cityDatabaseReader = new DatabaseReader.Builder(new
File(cityDatabasePath)).build();
+ }
+
+ /**
+ * Gets a map with geo-code fields for the given IP
+ * @param ip String Ip address
+ * @return Map the map of geo-code information
+ */
+ public final Map<String, Object> getGeocodedData(final String ip) {
+
+ InetAddress ipAddress;
+ //Initialize map with default values
+ Map<String, Object> geoData = getDefaultMap();
+
+ // Only get geo-code data for non-internal IPs
+ if (ipUtil.getNeworkOrigin(ip) != IpUtil.NetworkOrigin.INTERNET) {
+ return geoData;
+ }
+
+ try {
+ ipAddress = InetAddress.getByName(ip);
+ } catch (UnknownHostException hEx) {
+ LOG.warn(hEx);
+ return geoData;
+ }
+
+ // Only get geo-code data for non-internal IPs
+ IpUtil.NetworkOrigin origin = ipUtil.getNeworkOrigin(ip);
+ if (origin != IpUtil.NetworkOrigin.INTERNET) {
+ return geoData;
+ }
+
+ CityResponse response;
+ try {
+ response = cityDatabaseReader.city(ipAddress);
+ } catch (IOException iEx) {
+ LOG.warn(iEx);
+ return geoData;
+ } catch (GeoIp2Exception gEx) {
+ LOG.warn(gEx);
+ return geoData;
+ }
+
+ if (response == null)
+ return geoData;
+
+ Continent continent = response.getContinent();
+ if (continent != null) {
+ String name = continent.getName();
+ if (name != null) {
+ geoData.put(CONTINENT, name);
+ }
+ }
+
+ Country country = response.getCountry();
+ if (country != null) {
+ String name = country.getName();
+ String isoCode = country.getIsoCode();
+ if (name != null && isoCode != null) {
+ geoData.put(COUNTRY, name);
+ geoData.put(COUNTRY_CODE, isoCode);
+ }
+ }
+
+ List<Subdivision> subdivisions = response.getSubdivisions();
+ if (subdivisions != null && subdivisions.size() > 0) {
+ Subdivision subdivision = subdivisions.get(0);
+ if (subdivision != null) {
+ String name = subdivision.getName();
+ if (name != null) {
+ geoData.put(SUBDIVISION, name);
+ }
+ }
+ }
+
+ City city = response.getCity();
+ if (city != null) {
+ String name = city.getName();
+ if (name != null) {
+ geoData.put(CITY, name);
+ }
+ }
+
+ Postal postal = response.getPostal();
+ if (postal != null) {
+ String code = postal.getCode();
+ if (code != null) {
+ geoData.put(POSTAL_CODE, code);
+ }
+ }
+
+ Location location = response.getLocation();
+ if (location != null) {
+ Double lat = location.getLatitude();
+ Double lon = location.getLongitude();
+ if (lat != null && lon != null) {
+ geoData.put(LATITUDE, lat);
+ geoData.put(LONGITUDE, lon);
+ }
+ if (location.getTimeZone() != null)
+ geoData.put(TIMEZONE, location.getTimeZone());
+ }
+
+ return geoData;
+ }
+
+ /**
+ * Creates a new geo data map with default values for all fields
+ * @return Map the map of default geo-code information (unknown)
+ */
+ private Map<String, Object> getDefaultMap() {
+ Map<String, Object> defaultGeoData = new HashMap<>();
+ defaultGeoData.put(CONTINENT, UNKNOWN_VALUE);
+ defaultGeoData.put(COUNTRY_CODE, UNKNOWN_COUNTRY_CODE);
+ defaultGeoData.put(COUNTRY, UNKNOWN_VALUE);
+ defaultGeoData.put(SUBDIVISION, UNKNOWN_VALUE);
+ defaultGeoData.put(CITY, UNKNOWN_VALUE);
+ defaultGeoData.put(POSTAL_CODE, UNKNOWN_VALUE);
+ defaultGeoData.put(LATITUDE, UNKNOWN_LAT_LONG);
+ defaultGeoData.put(LONGITUDE, UNKNOWN_LAT_LONG);
+ defaultGeoData.put(TIMEZONE, UNKNOWN_VALUE);
+
+ return defaultGeoData;
+ }
+
+}
diff --git
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/GeocodeCountry.java
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/GeocodeCountry.java
new file mode 100644
index 0000000..8c2a2e7
--- /dev/null
+++
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/GeocodeCountry.java
@@ -0,0 +1,121 @@
+/**
+ * Copyright (C) 2014 Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wikimedia.analytics.refinery.core;
+
+import com.maxmind.geoip2.DatabaseReader;
+import com.maxmind.geoip2.exception.GeoIp2Exception;
+import com.maxmind.geoip2.model.CountryResponse;
+import com.maxmind.geoip2.record.Country;
+import org.apache.log4j.Logger;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.Locale;
+
+/**
+ * Contains functions to find country information of an IP address using
Maxmind's GeoIP2-Country
+ */
+public class GeocodeCountry {
+ // Default path to Maxmind Country database
+ public static final String DEFAULT_DATABASE_COUNTRY_PATH =
"/usr/share/GeoIP/GeoIP2-Country.mmdb";
+
+ static final Logger LOG = Logger.getLogger(GeocodeCountry.class.getName());
+
+ private static final String UNKNOWN_COUNTRY_CODE = "--";
+ private static final String UNKNOWN_VALUE = "Unknown";
+
+ private DatabaseReader countryDatabaseReader;
+
+ private final IpUtil ipUtil = new IpUtil();
+
+
+ /**
+ * Constructs a GeocodeCountry object with the default Maxmind 2 Country
database path.
+ * You can override the default database path by setting the
'maxmind.database.country' property.
+ */
+ public GeocodeCountry() throws IOException {
+ this(null);
+ }
+
+ /**
+ * Constructs a GeocodeCountry object with the provided Maxmind 2 Country
database path.
+ * This path is 'optional', in that you may set it to null. If null, the
system
+ * properties 'maxmind.database.country' will be checked for paths.
+ * If it is not set, then this will default to
DEFAULT_DATABASE_COUNTRY_PATH.
+ *
+ * @param countryDatabasePath String path to Maxmind's country database
+ */
+ public GeocodeCountry(String countryDatabasePath) throws IOException {
+ // Override database paths with System properties, if they exist
+ if (countryDatabasePath == null) {
+ countryDatabasePath =
System.getProperty("maxmind.database.country", DEFAULT_DATABASE_COUNTRY_PATH);
+ }
+
+ LOG.info("Geocode using Maxmind country database: " +
countryDatabasePath);
+
+ countryDatabaseReader = new DatabaseReader.Builder(new
File(countryDatabasePath)).build();
+ }
+
+ /**
+ * Gets the country code for the given IP
+ * @param ip String IP address
+ * @return String the country code
+ */
+ public final String getCountryCode(final String ip) {
+ // Only get country for non-internal IPs
+ if (ipUtil.getNeworkOrigin(ip) != IpUtil.NetworkOrigin.INTERNET) {
+ return UNKNOWN_COUNTRY_CODE;
+ }
+ try {
+ InetAddress ipAddress = InetAddress.getByName(ip);
+ CountryResponse response =
countryDatabaseReader.country(ipAddress);
+ Country country = response.getCountry();
+ String ret = country.getIsoCode();
+ if (ret == null) {
+ ret = UNKNOWN_COUNTRY_CODE;
+ }
+ return ret;
+ } catch (UnknownHostException hEx) {
+ LOG.warn(hEx);
+ return UNKNOWN_COUNTRY_CODE;
+ } catch (IOException iEx) {
+ LOG.warn(iEx);
+ return UNKNOWN_COUNTRY_CODE;
+ } catch (GeoIp2Exception gEx) {
+ LOG.warn(gEx);
+ return UNKNOWN_COUNTRY_CODE;
+ }
+ }
+
+ /**
+ * Translate a country code into the country name
+ *
+ * @param countryCode the country-code for which to get the country name
+ *
+ * @return String country name
+ */
+ public static String getCountryName(String countryCode) {
+ if (countryCode == null){
+ countryCode = "";
+ }
+ Locale l = new Locale("", countryCode);
+ String displayCountry = l.getDisplayCountry();
+ return displayCountry.equalsIgnoreCase(countryCode) ?
UNKNOWN_VALUE : displayCountry;
+ }
+}
diff --git
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/GeocodeISP.java
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/GeocodeISP.java
new file mode 100644
index 0000000..e1ae40c
--- /dev/null
+++
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/GeocodeISP.java
@@ -0,0 +1,156 @@
+/**
+ * Copyright (C) 2014 Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wikimedia.analytics.refinery.core;
+
+import com.maxmind.geoip2.DatabaseReader;
+import com.maxmind.geoip2.exception.GeoIp2Exception;
+import com.maxmind.geoip2.model.IspResponse;
+import org.apache.log4j.Logger;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Contains functions to find ISP information of an IP address using Maxmind's
GeoIP2-ISP
+ */
+public class GeocodeISP {
+ // Default paths to Maxmind ISP database
+ public static final String DEFAULT_DATABASE_ISP_PATH =
"/usr/share/GeoIP/GeoIP2-ISP.mmdb";
+
+ static final Logger LOG = Logger.getLogger(GeocodeISP.class.getName());
+
+ // Constants to hold the keys to use in data map
+ private static final String ISP = "isp";
+ private static final String ORGANISATION = "organization";
+ private static final String AUTONOMOUS_SYSTEM_ORGANIZATION =
"autonomous_system_organization";
+ private static final String AUTONOMOUS_SYSTEM_NUMBER =
"autonomous_system_number";
+
+ // Expected range is 0 to 4,294,967,295
+ // see https://en.wikipedia.org/wiki/Autonomous_system_(Internet)
+ private static final int UNKNOWN_AUTONOMOUS_SYSTEM_NUMBER = -1;
+ private static final String UNKNOWN_VALUE = "Unknown";
+
+ private DatabaseReader ispDatabaseReader;
+ private final IpUtil ipUtil = new IpUtil();
+
+
+ /**
+ * Constructs a GeocodeISP object with the default Maxmind 2 ISP database
path.
+ * You can override the default database path by setting the
'maxmind.database.isp' property.
+ */
+ public GeocodeISP() throws IOException {
+ this(null);
+ }
+
+ /**
+ * Constructs a GeocodeISP object with the provided Maxmind 2 ISP database
path.
+ * This path is 'optional', in that you may set it to null. If null, the
system
+ * property 'maxmind.database.isp' will be checked for a path.
+ * If it is not set, then this will default to DEFAULT_DATABASE_ISP_PATH.
+ *
+ * @param ispDatabasePath String path to Maxmind's ISP database
+ */
+ public GeocodeISP(String ispDatabasePath) throws IOException {
+ // Override database paths with System properties, if they exist
+ if (ispDatabasePath == null) {
+ ispDatabasePath = System.getProperty("maxmind.database.isp",
DEFAULT_DATABASE_ISP_PATH);
+ }
+
+ LOG.info("Geocode using Maxmind ISP database: " + ispDatabasePath);
+
+ ispDatabaseReader = new DatabaseReader.Builder(new
File(ispDatabasePath)).build();
+ }
+
+ /**
+ * Gets a map with ISP fields for the given IP
+ * @param ip String Ip address
+ * @return Map the map of ISP information
+ */
+ public final Map<String, Object> getISPData(final String ip) {
+
+ InetAddress ipAddress;
+ //Initialize map with default values
+ Map<String, Object> ispData = getDefaultMap();
+
+ try {
+ ipAddress = InetAddress.getByName(ip);
+ } catch (UnknownHostException hEx) {
+ LOG.warn(hEx);
+ return ispData;
+ }
+
+ // Only get ISP value for non-internal IPs
+ if (ipUtil.getNeworkOrigin(ip) != IpUtil.NetworkOrigin.INTERNET) {
+ return ispData;
+ }
+
+ IspResponse response;
+ try {
+ response = ispDatabaseReader.isp(ipAddress);
+ } catch (IOException iEx) {
+ LOG.warn(iEx);
+ return ispData;
+ } catch (GeoIp2Exception gEx) {
+ LOG.warn(gEx);
+ return ispData;
+ }
+
+ if (response == null)
+ return ispData;
+
+ String isp = response.getIsp();
+ if (isp != null) {
+ ispData.put(ISP, isp);
+ }
+
+ String organization = response.getOrganization();
+ if (organization != null) {
+ ispData.put(ORGANISATION, organization);
+ }
+
+ String autonomousSystemOrganization =
response.getAutonomousSystemOrganization();
+ if (autonomousSystemOrganization != null) {
+ ispData.put(AUTONOMOUS_SYSTEM_ORGANIZATION,
autonomousSystemOrganization);
+ }
+
+ Integer autonomousSystemNumber = response.getAutonomousSystemNumber();
+ if (autonomousSystemNumber != null) {
+ ispData.put(AUTONOMOUS_SYSTEM_NUMBER, autonomousSystemNumber);
+ }
+
+ return ispData;
+ }
+
+ /**
+ * Creates a new ISP map with default values for all fields
+ * @return Map the map of default ISP information (unknown)
+ */
+ private Map<String, Object> getDefaultMap() {
+ Map<String, Object> defaultISPData = new HashMap<>();
+ defaultISPData.put(ISP, UNKNOWN_VALUE);
+ defaultISPData.put(ORGANISATION, UNKNOWN_VALUE);
+ defaultISPData.put(AUTONOMOUS_SYSTEM_ORGANIZATION, UNKNOWN_VALUE);
+ defaultISPData.put(AUTONOMOUS_SYSTEM_NUMBER,
UNKNOWN_AUTONOMOUS_SYSTEM_NUMBER);
+
+ return defaultISPData;
+ }
+
+}
diff --git
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/IpUtil.java
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/IpUtil.java
index 7e5ef66..d1f78a2 100644
---
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/IpUtil.java
+++
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/IpUtil.java
@@ -27,8 +27,10 @@
* List of trusted proxies
* <p>
* The following trusted proxies list is sourced from
- *
https://phabricator.wikimedia.org/source/operations-puppet/browse/master/manifests/network.pp;9f97e3c2c5bc012ba5c3751f13fd838a06d6528d$14
+ *
https://github.com/wikimedia/puppet/blob/production/modules/network/data/data.yaml
* For now, any updates to this source must be manually brought over here.
+ *
+ * Last update: 2018-01-12
*/
final String[] trustedProxies = new String[] {
"91.198.174.0/24",
@@ -37,43 +39,52 @@
"198.35.26.0/23",
"185.15.56.0/22",
"2a02:ec80::/32",
- "10.0.0.0/8"
+ "2001:df2:e500::/48",
+ "103.102.166.0/24",
+
+ "10.0.0.0/8" // Internal subnet
};
+
+
+
Set<IpAddressMatcher> trustedProxiesCache;
/**
* List of Wikimedia Labs subnets
* <p>
- * The following list is sourced from ops/puppet.git's
- * $all_network_subnets global variable. Specifically these were taken
- * from manifests/network.pp at git hash bc1d7ef.
- * @see
https://phabricator.wikimedia.org/diffusion/OPUP/browse/production/manifests/network.pp
+ * The following list is sourced from ops/puppet.git's data.yaml file (see
above)
+ * Specifically these were taken from the labs portion
+ * See
@https://github.com/wikimedia/puppet/blob/production/modules/network/data/data.yaml#L235
+ *
+ * Last updated: 2018-01-12
*/
final String[] labsSubnets = new String[] {
- // labs-instances1-a-eqiad
+ // labs-instances-eqiad
"10.68.0.0/24",
"2620:0:861:201::/64",
- // labs-instances1-b-eqiad
+
"10.68.16.0/21",
"2620:0:861:202::/64",
- // labs-instances1-c-eqiad
+
"10.68.32.0/24",
"2620:0:861:203::/64",
- // labs-instances1-d-eqiad
+
"10.68.48.0/24",
"2620:0:861:204::/64",
- // labs-hosts1-a-eqiad
- "10.64.4.0/24",
- "2620:0:861:117::/64",
- // labs-hosts1-b-eqiad
- "10.64.20.0/24",
- "2620:0:861:118::/64",
- // labs-hosts1-d-eqiad
- "10.64.52.0/24",
- // labs-support1-c-eqiad
- "10.64.37.0/24",
- "2620:0:861:119::/64"
+
+ // labs-instances-codfw
+ "10.196.0.0/24",
+ "2620:0:860:201::/64",
+
+ "10.196.16.0/21",
+ "2620:0:860:202::/64",
+
+ "10.196.32.0/24",
+ "2620:0:860:203::/64",
+
+ "10.196.48.0/24",
+ "2620:0:860:204::/64",
};
Set<IpAddressMatcher> labsSubnetsCache;
diff --git
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocode.java
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocodeCity.java
similarity index 74%
rename from
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocode.java
rename to
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocodeCity.java
index 9539a8e..16b6899 100644
---
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocode.java
+++
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocodeCity.java
@@ -20,28 +20,20 @@
import java.io.IOException;
import java.util.Map;
-public class TestGeocode extends TestCase {
+public class TestGeocodeCity extends TestCase {
- private Geocode geocode;
+ private GeocodeCity geocodeCity;
@BeforeClass
public void setUp() throws IOException {
- geocode = new Geocode();
- }
-
- public void testGeoCountryLookup() {
- //IPv4 addresses taken from Maxmind's test suite
- String ip = "81.2.69.160";
- assertEquals("GB", geocode.getCountryCode(ip));
- assertEquals("--", geocode.getCountryCode("-"));
- assertEquals("--", geocode.getCountryCode(null));
+ geocodeCity = new GeocodeCity();
}
public void testGeoDataLookupIPv4() {
//IPv4 addresses taken from Maxmind's test suite
String ip = "81.2.69.160";
- Map<String, Object> geoData = geocode.getGeocodedData(ip);
+ Map<String, Object> geoData = geocodeCity.getGeocodedData(ip);
assertNotNull("Geo data cannot be null", geoData);
assertEquals("Europe", geoData.get("continent"));
assertEquals("GB", geoData.get("country_code"));
@@ -58,7 +50,7 @@
//IPv6 representation of an IPv4 address taken from Maxmind's test
suite
String ip = "::ffff:81.2.69.160";
- Map<String, Object> geoData = geocode.getGeocodedData(ip);
+ Map<String, Object> geoData = geocodeCity.getGeocodedData(ip);
assertNotNull("Geo data cannot be null", geoData);
assertEquals("Europe", geoData.get("continent"));
assertEquals("GB", geoData.get("country_code"));
@@ -75,7 +67,7 @@
// Invalid or unknown IP address
String ip = "-";
- Map<String, Object> geoData = geocode.getGeocodedData(ip);
+ Map<String, Object> geoData = geocodeCity.getGeocodedData(ip);
assertNotNull("Geo data cannot be null", geoData);
assertEquals("--", geoData.get("country_code"));
assertEquals("Unknown", geoData.get("continent"));
@@ -93,7 +85,7 @@
// Invalid IP address
String ip = null;
- Map<String, Object> geoData = geocode.getGeocodedData(ip);
+ Map<String, Object> geoData = geocodeCity.getGeocodedData(ip);
assertNotNull("Geo data cannot be null", geoData);
assertEquals("Unknown", geoData.get("continent"));
assertEquals("--", geoData.get("country_code"));
@@ -106,17 +98,4 @@
assertEquals("Unknown", geoData.get("timezone"));
}
- public void testGetKnownCountryName() {
- assertEquals("Ireland", Geocode.getCountryName("IE"));
- assertEquals("Ireland", Geocode.getCountryName("ie"));
- }
-
- public void testGetUnknownCountryName() {
- assertEquals("Unknown", Geocode.getCountryName("-"));
- assertEquals("Unknown", Geocode.getCountryName("--"));
- assertEquals("Unknown", Geocode.getCountryName("XX"));
- assertEquals("Unknown", Geocode.getCountryName("XXX"));
- assertEquals("Unknown", Geocode.getCountryName("ct"));
- assertEquals("Unknown", Geocode.getCountryName(null));
- }
}
diff --git
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocodeCountry.java
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocodeCountry.java
new file mode 100644
index 0000000..8bc02b0
--- /dev/null
+++
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocodeCountry.java
@@ -0,0 +1,53 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.core;
+
+import junit.framework.TestCase;
+import org.junit.BeforeClass;
+
+import java.io.IOException;
+import java.util.Map;
+
+public class TestGeocodeCountry extends TestCase {
+
+ private GeocodeCountry geocodeCountry;
+
+ @BeforeClass
+ public void setUp() throws IOException {
+ geocodeCountry = new GeocodeCountry();
+ }
+
+ public void testGeoCountryLookup() {
+ //IPv4 addresses taken from Maxmind's test suite
+ String ip = "81.2.69.160";
+ assertEquals("GB", geocodeCountry.getCountryCode(ip));
+ assertEquals("--", geocodeCountry.getCountryCode("-"));
+ assertEquals("--", geocodeCountry.getCountryCode(null));
+ }
+
+ public void testGetKnownCountryName() {
+ assertEquals("Ireland", GeocodeCountry.getCountryName("IE"));
+ assertEquals("Ireland", GeocodeCountry.getCountryName("ie"));
+ }
+
+ public void testGetUnknownCountryName() {
+ assertEquals("Unknown", GeocodeCountry.getCountryName("-"));
+ assertEquals("Unknown", GeocodeCountry.getCountryName("--"));
+ assertEquals("Unknown", GeocodeCountry.getCountryName("XX"));
+ assertEquals("Unknown", GeocodeCountry.getCountryName("XXX"));
+ assertEquals("Unknown", GeocodeCountry.getCountryName("ct"));
+ assertEquals("Unknown", GeocodeCountry.getCountryName(null));
+ }
+}
diff --git
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocodeISP.java
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocodeISP.java
new file mode 100644
index 0000000..349d0d8
--- /dev/null
+++
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestGeocodeISP.java
@@ -0,0 +1,80 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.core;
+
+import junit.framework.TestCase;
+import org.junit.BeforeClass;
+
+import java.io.IOException;
+import java.util.Map;
+
+public class TestGeocodeISP extends TestCase {
+
+ private GeocodeISP geocodeISP;
+
+ @BeforeClass
+ public void setUp() throws IOException {
+ geocodeISP = new GeocodeISP();
+ }
+
+ public void testDoISPDataLookupIPv4() {
+ //IPv4 addresses taken from Maxmind's test suite
+ String ip = "82.99.17.96";
+
+ Map<String, Object> ispData = geocodeISP.getISPData(ip);
+ assertNotNull("ISP data cannot be null", ispData);
+ assertEquals("IP-Only Telecommunication Networks AB", ispData.get("isp"));
+ assertEquals("Effectiv Solutions", ispData.get("organization"));
+ assertEquals("IP-Only", ispData.get("autonomous_system_organization"));
+ assertEquals(12552, ispData.get("autonomous_system_number"));
+ }
+
+ public void testDoISPLookupIpv6() {
+ //IPv6 representation of an IPv4 address taken from Maxmind's test suite
+ String ip = "::ffff:82.99.17.96";
+
+ Map<String, Object> ispData = geocodeISP.getISPData(ip);
+ assertNotNull("ISP data cannot be null", ispData);
+ assertEquals("IP-Only Telecommunication Networks AB", ispData.get("isp"));
+ assertEquals("Effectiv Solutions", ispData.get("organization"));
+ assertEquals("IP-Only", ispData.get("autonomous_system_organization"));
+ assertEquals(12552, ispData.get("autonomous_system_number"));
+ }
+
+ public void testDoGeoLookupIpUnknown() {
+ // Invalid or unknown IP address
+ String ip = "-";
+
+ Map<String, Object> ispData = geocodeISP.getISPData(ip);
+ assertNotNull("ISP data cannot be null", ispData);
+ assertEquals("Unknown", ispData.get("isp"));
+ assertEquals("Unknown", ispData.get("organization"));
+ assertEquals("Unknown", ispData.get("autonomous_system_organization"));
+ assertEquals(-1, ispData.get("autonomous_system_number"));
+ }
+
+ public void testDoGeoLookupWithNull() {
+ // Invalid IP address
+ String ip = null;
+
+ Map<String, Object> ispData = geocodeISP.getISPData(ip);
+ assertNotNull("ISP data cannot be null", ispData);
+ assertEquals("Unknown", ispData.get("isp"));
+ assertEquals("Unknown", ispData.get("organization"));
+ assertEquals("Unknown", ispData.get("autonomous_system_organization"));
+ assertEquals(-1, ispData.get("autonomous_system_number"));
+ }
+
+}
diff --git a/refinery-core/src/test/resources/GeoIP2-ISP-Test.mmdb
b/refinery-core/src/test/resources/GeoIP2-ISP-Test.mmdb
new file mode 100644
index 0000000..e066bbb
--- /dev/null
+++ b/refinery-core/src/test/resources/GeoIP2-ISP-Test.mmdb
Binary files differ
diff --git a/refinery-hive/pom.xml b/refinery-hive/pom.xml
index 0e4503d..7c0b5e8 100644
--- a/refinery-hive/pom.xml
+++ b/refinery-hive/pom.xml
@@ -92,6 +92,7 @@
<systemPropertyVariables>
<maxmind.database.country>${project.build.testOutputDirectory}/GeoIP2-Country-Test.mmdb</maxmind.database.country>
<maxmind.database.city>${project.build.testOutputDirectory}/GeoIP2-City-Test.mmdb</maxmind.database.city>
+ <maxmind.database.isp>${project.build.testOutputDirectory}/GeoIP2-ISP-Test.mmdb</maxmind.database.isp>
</systemPropertyVariables>
</configuration>
</plugin>
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java
index 46db093..2b647f6 100644
---
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GeocodedCountryUDF.java
@@ -28,11 +28,10 @@
* CREATE TEMPORARY FUNCTION geocode_country as
'org.wikimedia.analytics.refinery.hive.GeocodedCountryUDF';
* SELECT geocode_country(ip) from webrequest where year = 2014 limit 10;
*
- * The above steps assume that the two required files - GeoIP2-Country.mmdb
and GeoIP2-City.mmdb - are available
- * in their default path /usr/share/GeoIP. If not, then add the following
steps:
+ * The above steps assume that the required file GeoIP2-Country.mmdb is
available
+ * in its default path /usr/share/GeoIP. If not, then add the following steps:
*
* SET maxmind.database.country=/path/to/GeoIP2-Country.mmdb;
- * SET maxmind.database.city=/path/to/GeoIP2-City.mmdb;
*/
@Deprecated
@UDFType(deterministic = true)
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetCountryISOCodeUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetCountryISOCodeUDF.java
index bb4808d..dd05c79 100644
---
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetCountryISOCodeUDF.java
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetCountryISOCodeUDF.java
@@ -36,7 +36,7 @@
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
-import org.wikimedia.analytics.refinery.core.Geocode;
+import org.wikimedia.analytics.refinery.core.GeocodeCountry;
/**
* A Hive UDF to lookup country codes from IP addresses.
@@ -46,11 +46,10 @@
* CREATE TEMPORARY FUNCTION get_country_iso as
'org.wikimedia.analytics.refinery.hive.GetCountryISOCodeUDF';
* SELECT get_country_iso(ip) from webrequest where year = 2014 limit 10;
*
- * The above steps assume that the two required files - GeoIP2-Country.mmdb
and GeoIP2-City.mmdb - are available
- * in their default path /usr/share/GeoIP. If not, then add the following
steps:
+ * The above steps assume that the required file GeoIP2-Country.mmdb is
available
+ * in its default path /usr/share/GeoIP. If not, then add the following steps:
*
* SET maxmind.database.country=/path/to/GeoIP2-Country.mmdb;
- * SET maxmind.database.city=/path/to/GeoIP2-City.mmdb;
*/
@UDFType(deterministic = true)
@Description(
@@ -61,7 +60,7 @@
private final Text result = new Text();
private ObjectInspector argumentOI;
- private Geocode geocode;
+ private GeocodeCountry geocodeCountry;
static final Logger LOG =
Logger.getLogger(GetCountryISOCodeUDF.class.getName());
@@ -96,12 +95,11 @@
@Override
public void configure(MapredContext context) {
- if (geocode == null) {
+ if (geocodeCountry == null) {
try {
JobConf jobConf = context.getJobConf();
- geocode = new Geocode(
- jobConf.getTrimmed("maxmind.database.country"),
- jobConf.getTrimmed("maxmind.database.city")
+ geocodeCountry = new GeocodeCountry(
+ jobConf.getTrimmed("maxmind.database.country")
);
} catch (IOException ex) {
LOG.error(ex);
@@ -115,13 +113,13 @@
@SuppressWarnings("unchecked")
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
- assert geocode != null : "Evaluate called without initializing 'geocode'";
+ assert geocodeCountry != null : "Evaluate called without initializing 'geocodeCountry'";
result.clear();
if (arguments.length == 1 && argumentOI != null && arguments[0] !=
null) {
String ip = ((StringObjectInspector)
argumentOI).getPrimitiveJavaObject(arguments[0].get());
- result.set(geocode.getCountryCode(ip));
+ result.set(geocodeCountry.getCountryCode(ip));
}
return result;
}
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetCountryNameUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetCountryNameUDF.java
index b45e9b6..0204ecc 100644
---
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetCountryNameUDF.java
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetCountryNameUDF.java
@@ -27,7 +27,7 @@
import
org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;
-import org.wikimedia.analytics.refinery.core.Geocode;
+import org.wikimedia.analytics.refinery.core.GeocodeCountry;
/**
* A Hive UDF to lookup country name from country code.
@@ -78,7 +78,7 @@
public Object evaluate(DeferredObject[] arguments) throws HiveException {
result.clear();
String countryCode =
argumentOI.getPrimitiveJavaObject(arguments[0].get());
- result.set(Geocode.getCountryName(countryCode));
+ result.set(GeocodeCountry.getCountryName(countryCode));
return result;
}
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetGeoDataUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetGeoDataUDF.java
index b59ae0f..72aae25 100644
---
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetGeoDataUDF.java
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetGeoDataUDF.java
@@ -32,7 +32,8 @@
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
-import org.wikimedia.analytics.refinery.core.Geocode;
+import org.wikimedia.analytics.refinery.core.GeocodeCity;
+import org.wikimedia.analytics.refinery.core.GeocodeCountry;
import java.io.IOException;
import java.util.HashMap;
@@ -46,10 +47,9 @@
* CREATE TEMPORARY FUNCTION get_geo_data as
'org.wikimedia.analytics.refinery.hive.GetGeoDataUDF';
* SELECT get_geo_data(ip)['country'], get_geo_data(ip)['city'] from
webrequest where year = 2014 limit 10;
*
- * The above steps assume that the two required files - GeoIP2-Country.mmdb
and GeoIP2-City.mmdb - are available
- * in their default path /usr/share/GeoIP. If not, then add the following
steps:
+ * The above steps assume that the required file GeoIP2-City.mmdb is available
+ * in its default path /usr/share/GeoIP. If not, then add the following steps:
*
- * SET maxmind.database.country=/path/to/GeoIP2-Country.mmdb;
* SET maxmind.database.city=/path/to/GeoIP2-City.mmdb;
*/
@UDFType(deterministic = true)
@@ -60,7 +60,7 @@
Map<String, String> result;
private ObjectInspector argumentOI;
- private Geocode geocode;
+ private GeocodeCity geocodeCity;
static final Logger LOG = Logger.getLogger(GetGeoDataUDF.class.getName());
@@ -103,7 +103,7 @@
argumentOI = arg1;
- result = new HashMap<String, String>();
+ result = new HashMap<>();
return ObjectInspectorFactory.getStandardMapObjectInspector(
PrimitiveObjectInspectorFactory.javaStringObjectInspector,
@@ -112,12 +112,10 @@
@Override
public void configure(MapredContext context) {
- if (geocode == null) {
+ if (geocodeCity == null) {
try {
JobConf jobConf = context.getJobConf();
- geocode = new Geocode(
- jobConf.getTrimmed("maxmind.database.country"),
- jobConf.getTrimmed("maxmind.database.city")
+ geocodeCity = new GeocodeCity(jobConf.getTrimmed("maxmind.database.city")
);
} catch (IOException ex) {
LOG.error(ex);
@@ -148,13 +146,13 @@
@SuppressWarnings("unchecked")
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
- assert geocode != null : "Evaluate called without initializing 'geocode'";
+ assert geocodeCity != null : "Evaluate called without initializing 'geocodeCity'";
result.clear();
if (arguments.length == 1 && argumentOI != null && arguments[0] !=
null) {
String ip = ((StringObjectInspector)
argumentOI).getPrimitiveJavaObject(arguments[0].get());
- Map<String, Object> geoDataResult = geocode.getGeocodedData(ip);
+ Map<String, Object> geoDataResult =
geocodeCity.getGeocodedData(ip);
if (geoDataResult != null) {
for (String field : geoDataResult.keySet()) {
Object value = geoDataResult.get(field);
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetISPDataUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetISPDataUDF.java
new file mode 100644
index 0000000..bdf377e
--- /dev/null
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetISPDataUDF.java
@@ -0,0 +1,170 @@
+/**
+ * Copyright (C) 2014 Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wikimedia.analytics.refinery.hive;
+
+import org.apache.hadoop.hive.ql.exec.*;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.log4j.Logger;
+import org.wikimedia.analytics.refinery.core.GeocodeCity;
+import org.wikimedia.analytics.refinery.core.GeocodeISP;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * A Hive UDF to lookup ISP fields from IP addresses.
+ * <p>
+ * Hive Usage:
+ * ADD JAR /path/to/refinery-hive.jar;
+ * CREATE TEMPORARY FUNCTION get_isp_data as
'org.wikimedia.analytics.refinery.hive.GetISPDataUDF';
+ * SELECT get_isp_data(ip)['isp'], get_isp_data(ip)['organization'] from
webrequest where year = 2014 limit 10;
+ *
+ * The above steps assume that the required file GeoIP2-ISP.mmdb is available
+ * in its default path /usr/share/GeoIP. If not, then add the following steps:
+ *
+ * SET maxmind.database.isp=/path/to/GeoIP2-ISP.mmdb;
+ */
+@UDFType(deterministic = true)
+@Description(name = "get_isp_data", value = "_FUNC_(ip) - "
+ + "Returns a map with isp, organization, autonomous_system_organization, autonomous_system_number "
+ + "keys and the appropriate values for each of them")
+public class GetISPDataUDF extends GenericUDF {
+
+ Map<String, String> result;
+ private ObjectInspector argumentOI;
+ private GeocodeISP geocodeISP;
+
+ static final Logger LOG = Logger.getLogger(GetISPDataUDF.class.getName());
+
+ /**
+ * The initialize method is called only once during the lifetime of the
UDF.
+ * <p/>
+ * Method checks for the validity (number, type, etc)
+ * of the arguments being passed to the UDF.
+ * It also sets the return type of the result of the UDF,
+ * in this case the ObjectInspector equivalent of
+ * Map<String,String>
+ *
+ * @param arguments
+ * @return ObjectInspector Map<String,String>
+ * @throws UDFArgumentException
+ */
+ @Override
+ public ObjectInspector initialize(ObjectInspector[] arguments)
+ throws UDFArgumentException {
+
+ if (arguments.length != 1) {
+ throw new UDFArgumentLengthException("The GetISPDataUDF takes an array with only 1 element as argument");
+ }
+
+ ObjectInspector arg1 = arguments[0];
+
+ if (arg1.getCategory() != Category.PRIMITIVE) {
+ throw new UDFArgumentTypeException(0,
+ "A string argument was expected but an argument of type " + arg1.getTypeName()
+ + " was given.");
+ }
+
+ PrimitiveCategory primitiveCategory = ((PrimitiveObjectInspector)
arg1).getPrimitiveCategory();
+
+ if (primitiveCategory != PrimitiveCategory.STRING) {
+ throw new UDFArgumentTypeException(0,
+ "A string argument was expected but an argument of type " + arg1.getTypeName()
+ + " was given.");
+ }
+
+ argumentOI = arg1;
+
+ result = new HashMap<>();
+
+ return ObjectInspectorFactory.getStandardMapObjectInspector(
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector,
+ PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+ }
+
+ @Override
+ public void configure(MapredContext context) {
+ if (geocodeISP == null) {
+ try {
+ JobConf jobConf = context.getJobConf();
+ geocodeISP = new GeocodeISP(jobConf.getTrimmed("maxmind.database.isp")
+ );
+ } catch (IOException ex) {
+ LOG.error(ex);
+ throw new RuntimeException(ex);
+ }
+ }
+
+ super.configure(context);
+ }
+
+ /**
+ * Takes the actual arguments and returns the result.
+ * Gets passed the input, does whatever it wants to it,
+ * and then returns the output.
+ * <p/>
+ * The input is accessed using the ObjectInspectors that
+ * were saved into global variables in the call to initialize()
+ * <p/>
+ * This method is called once for every row of data being processed.
+ * UDFs are called during the map phase of the MapReduce job.
+ * This means that we have no control over the order in which the
+ * records get sent to the UDF.
+ *
+ * @param arguments
+ * @return Object Map<String, String>
+ * @throws HiveException
+ */
+ @SuppressWarnings("unchecked")
+ @Override
+ public Object evaluate(DeferredObject[] arguments) throws HiveException {
+ assert geocodeISP != null : "Evaluate called without initializing 'geocodeISP'";
+
+ result.clear();
+
+ if (arguments.length == 1 && argumentOI != null && arguments[0] !=
null) {
+ String ip = ((StringObjectInspector)
argumentOI).getPrimitiveJavaObject(arguments[0].get());
+ Map<String, Object> ispDataResult = geocodeISP.getISPData(ip);
+ if (ispDataResult != null) {
+ for (String field : ispDataResult.keySet()) {
+ Object value = ispDataResult.get(field);
+ if (value != null) {
+ result.put(field, value.toString());
+ }
+ }
+ }
+ }
+ return result;
+ }
+
+ @Override
+ public String getDisplayString(String[] arguments) {
+ assert (arguments.length == 1);
+ return "get_isp_data(" + arguments[0] + ")";
+ }
+}
diff --git
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetISPDataUDF.java
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetISPDataUDF.java
new file mode 100644
index 0000000..d656782
--- /dev/null
+++
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetISPDataUDF.java
@@ -0,0 +1,113 @@
+/**
+ * Copyright (C) 2014 Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.wikimedia.analytics.refinery.hive;
+
+import org.apache.hadoop.hive.ql.exec.MapredContext;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.mapred.JobConf;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestGetISPDataUDF {
+
+ @Test(expected = UDFArgumentLengthException.class)
+ public void testBadNumberOfArguments() throws HiveException {
+ ObjectInspector value1 =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ ObjectInspector value2 =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ ObjectInspector[] initArguments = new ObjectInspector[]{value1,
value2};
+ GetISPDataUDF getISPDataUDF = new GetISPDataUDF();
+ getISPDataUDF.initialize(initArguments);
+ }
+
+ @Test(expected = UDFArgumentTypeException.class)
+ public void testWrongTypeOfArguments() throws HiveException {
+ ObjectInspector value1 =
PrimitiveObjectInspectorFactory.javaIntObjectInspector;
+ ObjectInspector[] initArguments = new ObjectInspector[]{value1};
+ GetISPDataUDF getISPDataUDF = new GetISPDataUDF();
+ getISPDataUDF.initialize(initArguments);
+ }
+
+ /*
+ * The following tests use the same data as refinery-core's TestGeocodeISP
+ */
+ @Test
+ public void testEvaluateWithValidIPv4() throws HiveException, IOException {
+ //IPv4 addresses taken from Maxmind's test suite
+ String ip = "82.99.17.96";
+ Map<String, String> result = evaluate (ip);
+
+ assertEquals("ISP check", "IP-Only Telecommunication Networks AB",
result.get("isp"));
+ assertEquals("Organization check", "Effectiv Solutions",
result.get("organization"));
+ assertEquals("Autonomous-system-organization check", "IP-Only",
result.get("autonomous_system_organization"));
+ assertEquals("Autonomous-system-number check", "12552",
result.get("autonomous_system_number"));
+ }
+
+ @Test
+ public void testEvaluateWithValidIPv6() throws HiveException, IOException {
+ //IPv6 representation of an IPv4 address taken from Maxmind's test
suite
+ String ip = "::ffff:82.99.17.96";
+ Map<String, String> result = evaluate (ip);
+
+ assertEquals("ISP check", "IP-Only Telecommunication Networks AB",
result.get("isp"));
+ assertEquals("Organization check", "Effectiv Solutions",
result.get("organization"));
+ assertEquals("Autonomous-system-organization check", "IP-Only",
result.get("autonomous_system_organization"));
+ assertEquals("Autonomous-system-number check", "12552",
result.get("autonomous_system_number"));
+ }
+
+ @Test
+ public void testEvaluateWithInvalidIPs() throws HiveException, IOException
{
+ //Invalid IP
+ String ip = "-";
+ Map<String, String> result = evaluate(ip);
+
+ assertEquals("ISP check", "Unknown", result.get("isp"));
+ assertEquals("Organization check", "Unknown",
result.get("organization"));
+ assertEquals("Autonomous-system-organization check", "Unknown",
result.get("autonomous_system_organization"));
+ assertEquals("Autonomous-system-number check", "-1",
result.get("autonomous_system_number"));
+
+ ip = null;
+ result = evaluate(ip);
+
+ assertEquals("ISP check", "Unknown", result.get("isp"));
+ assertEquals("Organization check", "Unknown",
result.get("organization"));
+ assertEquals("Autonomous-system-organization check", "Unknown",
result.get("autonomous_system_organization"));
+ assertEquals("Autonomous-system-number check", "-1",
result.get("autonomous_system_number"));
+ }
+
+ private Map<String, String> evaluate(String ip) throws HiveException,
IOException {
+ ObjectInspector value1 =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ ObjectInspector[] initArguments = new ObjectInspector[]{value1};
+ GetISPDataUDF getISPDataUDF = new GetISPDataUDF();
+
+ getISPDataUDF.initialize(initArguments);
+ getISPDataUDF.configure(MapredContext.init(false, new JobConf()));
+
+ DeferredObject[] args = new DeferredObject[] { new
DeferredJavaObject(ip) };
+ Map<String, String> result = (Map<String,
String>)getISPDataUDF.evaluate(args);
+ getISPDataUDF.close();
+ return result;
+ }
+}
diff --git a/refinery-hive/src/test/resources/GeoIP2-City-Test.mmdb
b/refinery-hive/src/test/resources/GeoIP2-City-Test.mmdb
deleted file mode 100644
index 2e271fe..0000000
--- a/refinery-hive/src/test/resources/GeoIP2-City-Test.mmdb
+++ /dev/null
Binary files differ
diff --git a/refinery-hive/src/test/resources/GeoIP2-City-Test.mmdb
b/refinery-hive/src/test/resources/GeoIP2-City-Test.mmdb
new file mode 120000
index 0000000..d946926
--- /dev/null
+++ b/refinery-hive/src/test/resources/GeoIP2-City-Test.mmdb
@@ -0,0 +1 @@
+../../../../refinery-core/src/test/resources/GeoIP2-City-Test.mmdb
\ No newline at end of file
diff --git a/refinery-hive/src/test/resources/GeoIP2-Country-Test.mmdb
b/refinery-hive/src/test/resources/GeoIP2-Country-Test.mmdb
deleted file mode 100644
index 635ee70..0000000
--- a/refinery-hive/src/test/resources/GeoIP2-Country-Test.mmdb
+++ /dev/null
Binary files differ
diff --git a/refinery-hive/src/test/resources/GeoIP2-Country-Test.mmdb
b/refinery-hive/src/test/resources/GeoIP2-Country-Test.mmdb
new file mode 120000
index 0000000..7e0d1da
--- /dev/null
+++ b/refinery-hive/src/test/resources/GeoIP2-Country-Test.mmdb
@@ -0,0 +1 @@
+../../../../refinery-core/src/test/resources/GeoIP2-Country-Test.mmdb
\ No newline at end of file
diff --git a/refinery-hive/src/test/resources/GeoIP2-ISP-Test.mmdb
b/refinery-hive/src/test/resources/GeoIP2-ISP-Test.mmdb
new file mode 120000
index 0000000..e4351ff
--- /dev/null
+++ b/refinery-hive/src/test/resources/GeoIP2-ISP-Test.mmdb
@@ -0,0 +1 @@
+../../../../refinery-core/src/test/resources/GeoIP2-ISP-Test.mmdb
\ No newline at end of file
--
To view, visit https://gerrit.wikimedia.org/r/403916
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I602b04847d6083d8ba4a4de3c6614d8952d83608
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits