Mwalker has uploaded a new change for review.
https://gerrit.wikimedia.org/r/77462
Change subject: Adding GeoNames Tools
......................................................................
Adding GeoNames Tools
Change-Id: I985b5e7d86d8b421fdbf7cd55b3b675754989e82
---
A GeonameUpdater/.gitignore
A GeonameUpdater/geonames.cfg
A GeonameUpdater/geonames_updater.py
3 files changed, 418 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/wikimedia/fundraising/tools
refs/changes/62/77462/1
diff --git a/GeonameUpdater/.gitignore b/GeonameUpdater/.gitignore
new file mode 100644
index 0000000..2211df6
--- /dev/null
+++ b/GeonameUpdater/.gitignore
@@ -0,0 +1 @@
+*.txt
diff --git a/GeonameUpdater/geonames.cfg b/GeonameUpdater/geonames.cfg
new file mode 100644
index 0000000..7584c3c
--- /dev/null
+++ b/GeonameUpdater/geonames.cfg
@@ -0,0 +1,5 @@
+[MySQL]
+host:
+user:
+password:
+geonamesDB: geonames
diff --git a/GeonameUpdater/geonames_updater.py
b/GeonameUpdater/geonames_updater.py
new file mode 100644
index 0000000..1b4f432
--- /dev/null
+++ b/GeonameUpdater/geonames_updater.py
@@ -0,0 +1,412 @@
+#!/usr/bin/python2
+
+from ConfigParser import SafeConfigParser
+from optparse import OptionParser
+import dateutil.parser
+import pytz
+import MySQLdb as MySQL
+import sys
+import os
+import re
+import codecs
+
+_config = None
+_geonamesDB = None
+
+def install_schema():
+ global _geonamesDB
+ cur = _geonamesDB.cursor()
+
+ cur.execute("DROP TABLE IF EXISTS geonames;")
+ cur.execute("""
+ CREATE TABLE geonames (
+ geonameid INT UNSIGNED NOT NULL PRIMARY KEY,
+ name VARCHAR(200),
+ ascii_name VARCHAR(200),
+ latitude DOUBLE,
+ longitude DOUBLE,
+ feature_type_id INT UNSIGNED,
+ country_code CHAR(2),
+ admin1 VARCHAR(20),
+ admin2 VARCHAR(80),
+ admin3 VARCHAR(20),
+ admin4 VARCHAR(20),
+ population BIGINT,
+ tzid INT UNSIGNED,
+ last_modification DATE,
+
+ INDEX idx_name (geonameid),
+ INDEX idx_ascii_name (ascii_name),
+ INDEX idx_lat (latitude),
+ INDEX idx_long (longitude),
+ INDEX idx_feature_type (feature_type_id),
+ INDEX idx_country_code (country_code),
+ INDEX idx_admin1 (country_code, admin1),
+ INDEX idx_admin12 (country_code, admin1, admin2)
+ ) ENGINE InnoDB CHARACTER SET utf8;
+ """)
+
+ cur.execute("DROP TABLE IF EXISTS altnames;")
+ cur.execute("""
+ CREATE TABLE altnames (
+ int_altnameid INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ altnameid INT UNSIGNED,
+ geonameid INT UNSIGNED NOT NULL,
+ format VARCHAR(8),
+ altname VARCHAR(200),
+ is_preferred TINYINT(1) DEFAULT 0,
+ is_short TINYINT(1) DEFAULT 0,
+ is_colloquial TINYINT(1) DEFAULT 0,
+ is_historic TINYINT(1) DEFAULT 0,
+
+ INDEX idx_altnameid (altnameid),
+ INDEX idx_geonameid (geonameid),
+ INDEX idx_altname (altname),
+ INDEX idx_format (format)
+ ) ENGINE InnoDB CHARACTER SET utf8;
+ """)
+
+ cur.execute("DROP TABLE IF EXISTS timezones;")
+ cur.execute("""
+ CREATE TABLE timezones (
+ tzid INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ country_code CHAR(2),
+ tzname VARCHAR(40),
+ offset FLOAT,
+
+ INDEX idx_cc (country_code),
+ INDEX idx_timezone (tzname),
+ INDEX idx_offset (offset)
+ ) ENGINE InnoDB CHARACTER SET utf8;
+ """)
+
+ cur.execute("DROP TABLE IF EXISTS feature_types;")
+ cur.execute("""
+ CREATE TABLE feature_types (
+ ftid INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ class CHAR(1),
+ code VARCHAR(20),
+ name VARCHAR(100),
+ description VARCHAR(500),
+
+ INDEX idx_feature_class (class),
+ INDEX idx_feature_code (code),
+ INDEX idx_feature_composite (class, code)
+ ) ENGINE InnoDB CHARACTER SET utf8;
+ """)
+
+ cur.close()
+ write("Done!\n")
+
+
+def import_timezone_file(filename):
+ global _geonamesDB
+
+ # === Prepare things ===
+ try:
+ f = open(filename, 'r')
+ except (IOError):
+ writeErr("Could not open %s for read!" % filename)
+ return
+
+ # --- Clear the table ---
+ write( "Truncating table... " )
+ cur = _geonamesDB.cursor()
+ cur.execute("TRUNCATE timezones")
+
+ # === Import data ===
+ write("Importing... ")
+ count = 0
+
+ lre = re.compile(r"""^
+ (?P<countryCode>[^\t]*)\t
+ (?P<tzName>[^\t]*)\t
+ (?P<gmtOffset>[^\t]*)\t
+ (?P<dstOffset>[^\t]*)\t
+ (?P<utcOffset>[^\t]*)$
+ """, re.VERBOSE)
+
+ line = f.readline().strip() # The first line is a header
+ if line != "CountryCode TimeZoneId GMT offset 1. Jan 2013 DST
offset 1. Jul 2013 rawOffset (independant of DST)":
+ writeErr("File %s does not have the correct header!" % filename)
+ else:
+ for line in f:
+ line = line[:-1]
+ m = lre.match(line)
+ if not m:
+ writeErr("Line '%s' could not be parsed!" % line)
+ continue
+
+ cc = m.group('countryCode')
+ name = m.group('tzName')
+ offset = float(m.group('utcOffset'))
+
+ cur.execute("INSERT INTO timezones (country_code, tzname, offset)
VALUES (%s, %s, %s);", (cc, name, offset))
+ count = count + 1
+
+ cur.close()
+ f.close()
+ write("Done (%s records)!\n" % count)
+
+
+def import_features_file(filename):
+ global _geonamesDB
+
+ # === Prepare things ===
+ try:
+ f = open(filename, 'r')
+ except (IOError):
+ writeErr("Could not open %s for read!" % filename)
+ return
+
+ # --- Clear the table ---
+ write( "Truncating table... " )
+ cur = _geonamesDB.cursor()
+ cur.execute("TRUNCATE feature_types")
+
+ # === Import data ===
+ write("Importing... ")
+ count = 0
+
+ lre = re.compile(r"""^
+ (?P<class>[A-Z0-9])\.
+ (?P<code>[A-Z0-9]*)\t
+ (?P<name>[^\t]*)\t?
+ (?P<desc>.*)
+ $""", re.VERBOSE)
+
+ for line in f:
+ line = line[:-1]
+ m = lre.match(line)
+ if not m:
+ writeErr("Line '%s' could not be parsed!" % line)
+ continue
+
+ cur.execute(
+ "INSERT INTO feature_types (class, code, name, description) VALUES
(%s, %s, %s, %s);",
+ (m.group('class'), m.group('code'), m.group('name'),
m.group('desc'))
+ )
+ count = count + 1
+
+ cur.close()
+ f.close()
+ write("Done (%s records)!\n" % count)
+
+
+def import_geonames_file(filename):
+ global _geonamesDB
+
+ # === Prepare things ===
+ try:
+ f = codecs.open(filename, 'r', 'utf-8')
+ except (IOError):
+ writeErr("Could not open %s for read!" % filename)
+ return
+
+ # --- Clear the table ---
+ write( "Truncating table... " )
+ cur = _geonamesDB.cursor()
+ cur.execute("TRUNCATE geonames;")
+
+ # === Import data ===
+ write("Importing ")
+ count = 0
+
+ lre = re.compile(r"""^
+ (?P<geonameid>[^\t]*)\t
+ (?P<name>[^\t]*)\t
+ (?P<asciiname>[^\t]*)\t
+ (?P<altnames>[^\t]*)\t
+ (?P<lat>[^\t]*)\t
+ (?P<long>[^\t]*)\t
+ (?P<featureClass>[^\t]*)\t
+ (?P<featureCode>[^\t]*)\t
+ (?P<countryCode>[^\t]*)\t
+ (?P<cc2>[^\t]*)\t
+ (?P<admin1>[^\t]*)\t
+ (?P<admin2>[^\t]*)\t
+ (?P<admin3>[^\t]*)\t
+ (?P<admin4>[^\t]*)\t
+ (?P<pop>[^\t]*)\t
+ (?P<elevation>[^\t]*)\t
+ (?P<dem>[^\t]*)\t
+ (?P<tz>[^\t]*)\t
+ (?P<date>[^\t]*)
+ $""", re.VERBOSE)
+
+ for line in f:
+ line = line[:-1]
+ m = lre.match(line)
+ if not m:
+ writeErr("Line '%s' could not be parsed!" % line)
+ continue
+
+ cur.execute("""
+ INSERT INTO geonames (
+ geonameid, name, ascii_name, latitude, longitude,
feature_type_id, country_code,
+ admin1, admin2, admin3, admin4, population, tzid,
last_modification
+ )
+ SELECT %s, %s, %s, %s, %s, ftid, %s, %s, %s, %s, %s, %s, tzid,
%s
+ FROM timezones, feature_types ft WHERE tzname=%s AND
ft.class=%s AND ft.code=%s;
+ """,
+ (
+ int(m.group('geonameid')),
+ m.group('name'),
+ m.group('asciiname'),
+ float(m.group('lat')),
+ float(m.group('long')),
+ m.group('countryCode'),
+ m.group('admin1'),
+ m.group('admin2'),
+ m.group('admin3'),
+ m.group('admin4'),
+ int(m.group('pop')),
+ dateutil.parser.parse(m.group('date')),
+ m.group('tz'),
+ m.group('featureClass'),
+ m.group('featureCode'),
+ )
+ )
+ altnames = m.group('altnames').split(',')
+ if altnames:
+ id = int(m.group('geonameid'))
+ altnamereplace = []
+ for an in altnames:
+ altnamereplace.append((id, an))
+
+ cur.executemany(
+ "INSERT INTO altnames (geonameid, format, altname) VALUES (%s,
'translit', %s);",
+ altnamereplace
+ )
+ count = count + 1
+ if count % 1000 == 0:
+ write(".")
+
+ cur.close()
+ f.close()
+ write("Done (%s records)!\n" % count)
+
+
+def import_altnames_file(filename):
+ global _geonamesDB
+
+ # === Prepare things ===
+ try:
+ f = codecs.open(filename, 'r', 'utf-8')
+ except (IOError):
+ writeErr("Could not open %s for read!" % filename)
+ return
+
+ # --- Clear the table ---
+ write( "Truncating table... " )
+ cur = _geonamesDB.cursor()
+ cur.execute("TRUNCATE altnames;")
+
+ # === Import data ===
+ write("Importing ")
+ count = 0
+
+ for line in f:
+ line = line[:-1]
+ m = line.split("\t")
+ if len(m) != 8:
+ writeErr("Line '%s' could not be parsed! (%d parts)" % (line,
len(m)))
+ continue
+
+ cur.execute("""
+ INSERT INTO altnames (
+ altnameid, geonameid, format, altname, is_preferred,
is_short, is_colloquial, is_historic
+ ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
+ """,
+ (
+ int(m[0]),
+ int(m[1]),
+ m[2],
+ m[3],
+ int(m[4] or 0),
+ int(m[5] or 0),
+ int(m[6] or 0),
+ int(m[7] or 0)
+ )
+ )
+ count = count + 1
+ if count % 1000 == 0:
+ write(".")
+
+ cur.close()
+ f.close()
+ write("Done (%s records)!\n" % count)
+
+
+def write(str):
+ sys.stdout.write(str.encode('utf-8'))
+ sys.stdout.flush()
+
+
+def writeErr(str):
+ sys.stdout.write("%s\n" % str.encode('utf-8'))
+ sys.stdout.flush()
+
+
+if __name__ == "__main__":
+ # === Extract options ===
+ parser = OptionParser(usage="usage: %prog [options] <file root>")
+ parser.add_option("-c", "--config", dest='configFile', default=None,
help='Path to configuration file')
+ parser.add_option('--install-db', dest='installDb', action='store_true',
default=False, help='Install the schema')
+ parser.add_option('--import-tz', dest='importTz', action='store_true',
default=False, help='Import timeZones.txt (must happen before --import-gn)')
+ parser.add_option('--import-f', dest='importF', action='store_true',
default=False, help='Import featureCodes_en.txt (must happen before
--import-gn)')
+ parser.add_option('--import-gn', dest='importGn', action='store_true',
default=False, help='Import allCountries.txt')
+ parser.add_option('--import-alt', dest='importAlt', action='store_true',
default=False, help='Import alternateNames.txt (must happen before
--import-gn)')
+ (options, args) = parser.parse_args()
+
+ if len(args) == 0:
+ filepath = os.getcwd()
+ elif len(args) == 1:
+ filepath = args[1]
+ else:
+ parser.print_usage()
+ exit()
+
+ # === Do some initial setup of useful globals ===
+ # --- Like the configuration file :) ---
+ localdir = os.path.dirname(os.path.abspath(__file__))
+ _config = SafeConfigParser()
+ fileList = ["%s/geonames.cfg" % localdir]
+ if options.configFile is not None:
+ fileList.append(options.configFile)
+ _config.read(fileList)
+
+ # --- And the MySQL connection ---
+ _geonamesDB = MySQL.connect(
+ _config.get('MySQL', 'host'),
+ _config.get('MySQL', 'user'),
+ _config.get('MySQL', 'password'),
+ _config.get('MySQL', 'geonamesDB'),
+ use_unicode=True,
+ charset='utf8'
+ )
+ _geonamesDB.autocommit(True)
+
+ # === Run the requested actions! ===
+ if options.installDb:
+ write("Installing Geonames schema... ")
+ install_schema()
+
+ if options.importTz:
+ write("Importing TimeZone data... ")
+ import_timezone_file("%s/timeZones.txt" % filepath)
+
+ if options.importF:
+ write("Importing feature types... ")
+ import_features_file("%s/featureCodes_en.txt" % filepath)
+
+ if options.importAlt:
+ write("Importing Alternate Names data... ")
+ import_altnames_file("%s/alternateNames.txt" % filepath)
+
+ if options.importGn:
+ write("Importing GeoNames data... ")
+ import_geonames_file("%s/allCountries.txt" % filepath)
+
+ # === Cleanup
+ _geonamesDB.close()
--
To view, visit https://gerrit.wikimedia.org/r/77462
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I985b5e7d86d8b421fdbf7cd55b3b675754989e82
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/fundraising/tools
Gerrit-Branch: master
Gerrit-Owner: Mwalker <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits