Gehel has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/378245 )
Change subject: [WIP] maps: move to vector tiles and cleartables ...................................................................... [WIP] maps: move to vector tiles and cleartables * new OSM update script * new database name Bug: T157613 Change-Id: Ida723dee967cffb4ed44b46b14f7257f0a146ffd --- A hieradata/role/common/maps/test/vectortiles_master.yaml A modules/osm/files/process-osm-data.sh A modules/osm/manifests/cleartables_sync.pp A modules/osm/manifests/meddo.pp M modules/profile/manifests/maps/osm_master.pp M modules/profile/templates/maps/grants-gis.sql.erb A modules/role/manifests/maps/test/vectortiles_master.pp 7 files changed, 456 insertions(+), 19 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/45/378245/1 diff --git a/hieradata/role/common/maps/test/vectortiles_master.yaml b/hieradata/role/common/maps/test/vectortiles_master.yaml new file mode 100644 index 0000000..6833913 --- /dev/null +++ b/hieradata/role/common/maps/test/vectortiles_master.yaml @@ -0,0 +1,32 @@ +# the following passwords are defined in private repo: +#profile::maps::osm_master::kartotherian_pass: some_password +#profile::maps::osm_master::tilerator_pass: some_password +#profile::maps::osm_master::tileratorui_pass: some_password +#profile::maps::osm_master::osmimporter_pass: some_password +#profile::maps::osm_master::osmupdater_pass: some_password +#profile::maps::osm_master::replication_pass: some_password +#profile::maps::cassandra::kartotherian_pass: some_password +#profile::maps::cassandra::tilerator_pass: some_password +#profile::maps::cassandra::tileratorui_pass: some_password +#profile::cassandra::single_instance::super_pass: some_password + +admin::groups: + - maps-admins + - kartotherian-admin + - tilerator-admin + +cluster: 'maps-test-vector' + +profile::cassandra::single_instance::graphite_host: 'graphite-in.eqiad.wmnet' +service::configuration::statsd_host: 'statsd.eqiad.wmnet' + +cassandra::metrics::blacklist: + - .*\.metrics\.Table\..*$ + +profile::redis::master::instances: ['6379'] +profile::redis::master::settings: + bind: "0.0.0.0" + +profile::maps::postgresql_common::shared_buffers: '4GB' +profile::maps::postgresql_common::maintenance_work_mem: '3GB' +profile::maps::osm_master::cleartables: true diff --git a/modules/osm/files/process-osm-data.sh b/modules/osm/files/process-osm-data.sh new file mode 100644 index 0000000..db79fea --- /dev/null +++ b/modules/osm/files/process-osm-data.sh @@ -0,0 +1,283 @@ +#!/usr/bin/env bash + +# This script will +# 1. Download OSM data and load it into a DB +# 2. Update that DB +# 3. Keep a local copy of the planet up to date + +# Requirements +# - osmium-tool +# - osmosis +# - osm2pgsql +# - ClearTables +# - meddo + +set -e + +CREATE_DB=false + +BASE_DIR="/srv/osm_replication" +CLEARTABLES="/srv/deployment/tilerator/deploy/ClearTables" +MEDDO="/srv/deployment/tilerator/deploy/meddo" + +DATABASE="ct" + +PLANET_DIR="$BASE_DIR/planet" +PLANET_REPLICATION_BASE="$PLANET_DIR/planet-replication" +DATABASE_REPLICATION_BASE="$PLANET_DIR/database-replication" + +# -E 3857 is not required on newer versions of osm2pgsql +osm2pgsql_common_opts="-E 3857 --flat-nodes $PLANET_DIR/nodes.bin" +osm2pgsql_import_opts="--cache 10000 --number-processes 2" +osm2pgsql_update_opts="--cache 1000 --number-processes 1" + +function show_setup_help() { + cat << EOF +Usage: ${0##*/} setup data_url state_url replication_url + +Examples: + ${0##*/} setup http://download.geofabrik.de/north-america/canada/british-columbia-170101.osm.pbf \\ + http://download.geofabrik.de/north-america/canada/british-columbia-updates/000/001/384.state.txt \\ + http://download.geofabrik.de/north-america/canada/british-columbia-updates + +EOF +exit 1 +} + +function setup_data() { + if [ -z "$1" ]; then + echo "data_url not set" + show_setup_help + exit 0 + fi + if [ -z "$2" ]; then + echo "state_url not set" + show_setup_help + exit 0 + fi + if [ -z "$3" ]; then + echo "replication_url not set" + show_setup_help + exit 0 + fi + PLANET_URL="$1" + STATE_URL="$2" + REPLICATION_BASE="$3" + + mkdir -p "$PLANET_DIR" + mkdir -p "$PLANET_REPLICATION_BASE" + + cat <<EOF > "$PLANET_REPLICATION_BASE/configuration.txt" +# The URL of the directory containing change files. +baseUrl=$REPLICATION_BASE + +# Allow 3 days of downloads +maxInterval = 259200 +EOF + + echo "Downloading files" + curl --retry 5 -o "$PLANET_DIR/osm-data.osm.pbf" "$PLANET_URL" + curl --retry 5 -o "$PLANET_REPLICATION_BASE/state.txt" "$STATE_URL" + + # Call a function here to update the planet later +} + +function onplanetupdateexit { + [ -f "$PLANET_REPLICATION_BASE/state-prev.txt" ] && mv "$PLANET_REPLICATION_BASE/state-prev.txt" "$PLANET_REPLICATION_BASE/state.txt" +} + +function load_borders() { + echo "Loading borders" + psql -d ct -v ON_ERROR_STOP=1 -Xq <<EOF +CREATE SCHEMA IF NOT EXISTS loading; +DROP TABLE IF EXISTS loading.osmborder_lines; +CREATE TABLE loading.osmborder_lines ( + osm_id bigint, + admin_level int, + dividing_line bool, + disputed bool, + maritime bool, + way Geometry(LineString, 3857)); +\copy loading.osmborder_lines FROM $PLANET_DIR/osmborder_lines.csv +CREATE INDEX osmborder_lines_way_idx ON loading.osmborder_lines USING gist (way) WITH (fillfactor=100); +CLUSTER loading.osmborder_lines USING osmborder_lines_way_idx; +CREATE INDEX osmborder_lines_way_low_idx ON loading.osmborder_lines USING gist (way) WITH (fillfactor=100) WHERE admin_level <= 4; +ANALYZE loading.osmborder_lines; +BEGIN; +DROP TABLE IF EXISTS public.osmborder_lines; +ALTER TABLE loading.osmborder_lines SET SCHEMA public; +COMMIT; +EOF +} + +function planet_update() { + pushd "$PLANET_REPLICATION_BASE" + trap onplanetupdateexit EXIT + set -e + cp state.txt state-prev.txt + # Clean up from any previous runs + rm -f "$PLANET_DIR/changes.osc" + osmosis --read-replication-interval --write-xml-change file="$PLANET_DIR/changes.osc" + osmium apply-changes -v --fsync "$PLANET_DIR/osm-data.osm.pbf" "$PLANET_DIR/changes.osc" -o "$PLANET_DIR/osm-data-new.osm.pbf" + mv "$PLANET_DIR/osm-data-new.osm.pbf" "$PLANET_DIR/osm-data.osm.pbf" + # File is updated, clean up derived files + rm -f "$PLANET_DIR/changes.osc" "$PLANET_DIR/osm-filtered.osm.pbf" "$PLANET_DIR/osmborder_lines.csv" + osmborder_filter -o "$PLANET_DIR/osm-filtered.osm.pbf" "$PLANET_DIR/osm-data.osm.pbf" + osmborder -o "$PLANET_DIR/osmborder_lines.csv" "$PLANET_DIR/osm-filtered.osm.pbf" + load_borders + rm state-prev.txt +} + +function create_database() { + if [ $CREATE_DB = true ] ; then + createdb $DATABASE + psql -1Xq -d $DATABASE -c 'CREATE EXTENSION postgis; CREATE EXTENSION hstore;' + # Meddo needs these extensions above and beyond what ClearTables needs + psql -1Xq -d $DATABASE -c 'CREATE EXTENSION unaccent; CREATE EXTENSION fuzzystrmatch;' + fi + psql -d $DATABASE -f "$MEDDO/functions.sql" +} + +function import_data() { + create_database + + # Snapshot the current state + cp -r "$PLANET_REPLICATION_BASE" "$DATABASE_REPLICATION_BASE" + + # https://github.com/openstreetmap/osm2pgsql/issues/321 requires switching directories + pushd "$CLEARTABLES" + + # Build the ClearTables files + cat cleartables.yaml wikidata.yaml | ./yaml2json.py > "${BASE_DIR}/cleartables.json" + cat cleartables.yaml wikidata.yaml | ./createcomments.py > "${BASE_DIR}/comments.sql" + + cat sql/types/*.sql | psql -1Xq -d $DATABASE + + osm2pgsql $osm2pgsql_common_opts $osm2pgsql_import_opts --create --slim \ + -d $DATABASE --output multi --style ${BASE_DIR}/cleartables.json \ + -G "$PLANET_DIR/osm-data.osm.pbf" + cat sql/post/*.sql ${BASE_DIR}/comments.sql | psql -1Xq -d $DATABASE + popd +} + +function static_update() { + # Quite a simple function thanks to Meddo's scripts + pushd "$MEDDO" + "$MEDDO/get-external-data.py" + popd +} +function onupdateexit { + [ -f "$DATABASE_REPLICATION_BASE/state-prev.txt" ] && mv "$DATABASE_REPLICATION_BASE/state-prev.txt" "$DATABASE_REPLICATION_BASE/state.txt" +} + +function database_update() { +# see https://github.com/openstreetmap/chef/blob/master/cookbooks/tile/templates/default/replicate.erb for another example +# The OSMF example is a daemon with a while true loop, this is a one-shot script, but they both do the same task + pushd "$DATABASE_REPLICATION_BASE" + + trap onupdateexit EXIT + . state.txt + cp state.txt state-prev.txt + file="$PWD/changes-${sequenceNumber}.osm.gz" + osmosis --read-replication-interval --write-xml-change file="${file}" compressionMethod="gzip" + + prevSequenceNumber=$sequenceNumber + . state.txt + if [ "${sequenceNumber}" == "${prevSequenceNumber}" ] + then + echo "No new data available. Sleeping..." + # Remove file, it will just be an empty changeset + rm ${file} + # No need to rollback now + rm state-prev.txt + exit 0 + else + echo "Fetched new data from ${prevSequenceNumber} to ${sequenceNumber} into ${file}" + + # https://github.com/openstreetmap/osm2pgsql/issues/321 requires switching directories + pushd "$CLEARTABLES" + make + osm2pgsql $osm2pgsql_common_opts $osm2pgsql_update_opts --append --slim \ + -d $DATABASE --output multi --style cleartables.json \ + -G ${file} + + # Something should be done to create expire lists and process them + popd + + rm state-prev.txt + + # expire tiles + + find . -name 'changes-*.gz' -mmin +300 -exec rm -f {} \; + fi + popd +} + + +function clean () { + if [ "$really" != "yes" ]; then + echo "This will delete downloaded files and drop the database. If you really want to do this, set the enviornment variable \"really\" to yes" + exit 1 + fi + + rm -rf "$PLANET_DIR" + dropdb $DATABASE +} + +function show_help() { + cat << EOF +Usage: ${0##*/} mode + +Modes: + setup: Downloads initial data, updates it, and sets up replication (see setup --help for more info) + import: Import the data with osm2pgsql + static-update: Update the static data tables + planet-update: Update the planet file and regenerate borders + database-update: Update the database + clean: Clean everything up + +EOF +} + +if [ "$#" == "0" ]; then + show_help + exit 1 +fi + +command="$1" + +case "$command" in + setup) + shift + setup_data $@ + ;; + + import) + shift + import_data + ;; + + static-update) + shift + static_update + ;; + + planet-update) + shift + planet_update + ;; + + database-update) + shift + database_update + ;; + + clean) + shift + clean + ;; + + *) + show_help + ;; +esac diff --git a/modules/osm/manifests/cleartables_sync.pp b/modules/osm/manifests/cleartables_sync.pp new file mode 100644 index 0000000..384c29c --- /dev/null +++ b/modules/osm/manifests/cleartables_sync.pp @@ -0,0 +1,59 @@ +define osm::cleartables_sync ( + $pg_password, + $ensure = 'present', + $hour = '*', + $minute = '*/30', + $postreplicate_command = undef, + $proxy='webproxy.eqiad.wmnet:8080', +) { + + $log_dir = '/var/log/osm_replication/' + + include ::osm::meddo + include ::osm::users + + file { '/usr/local/bin/process-osm-data': + ensure => present, + owner => 'root', + group => 'root', + mode => '0555', + source => 'puppet:///modules/osm/process-osm-data.sh', + } + + file { [ $log_dir, '/srv/osm_replication' ]: + ensure => directory, + owner => 'osmupdater', + group => 'osmupdater', + mode => '0755', + } + + logrotate::rule {'cleartables-sync': + ensure => present, + frequency => 'daily', + max_age => 30, + rotate => 5, + date_ext => true, + compress => true, + missing_ok => true, + no_create => true, + } + + $base_cron_command = "/usr/local/bin/process-osm-data planet-update >> ${log_dir}/planet-update.log 2>&1" + $cron_command = $postreplicate_command ? { + undef => $base_cron_command, + default => "${base_cron_command} ; ${postreplicate_command} >> ${log_dir}/planet-update.log 2>&1" + } + + cron { "planet_sync-${name}": + ensure => $ensure, + command => $cron_command, + user => 'osmupdater', + hour => $hour, + minute => $minute, + environment => [ + "PGPASSWORD=${pg_password}", + "https_proxy=https://${proxy}", + ], + } + +} diff --git a/modules/osm/manifests/meddo.pp b/modules/osm/manifests/meddo.pp new file mode 100644 index 0000000..3252050 --- /dev/null +++ b/modules/osm/manifests/meddo.pp @@ -0,0 +1,19 @@ +# = Class: osm::meddo +# +# Meddo is a tool to manage OSM datasources. +# See https://github.com/kartotherian/meddo for details. +# +# At this point, meddo is very much related to all our OSM data import. +# Depending on how it evolves, it might make sense to move this to its own +# module at some point. +# +# == Parameters: +class osm::meddo { + # make sure all dependencies for meddo are present + require_package([ + 'python3-requests', + 'python3-psycopg2', + 'python3-yaml', + 'gdal-bin', + ]) +} diff --git a/modules/profile/manifests/maps/osm_master.pp b/modules/profile/manifests/maps/osm_master.pp index c51b5ef..41291f4 100644 --- a/modules/profile/manifests/maps/osm_master.pp +++ b/modules/profile/manifests/maps/osm_master.pp @@ -10,11 +10,17 @@ $osmupdater_pass = hiera('profile::maps::osm_master::osmupdater_pass'), $replication_pass = hiera('profile::maps::osm_master::replication_pass'), $postgres_slaves = hiera('profile::maps::osm_master::slaves', undef), + $cleartables = hiera('profile::maps::osm_master::cleartables', false), ) { + + require ::profile::maps::postgresql_common $maps_hosts_ferm = join($maps_hosts, ' ') - require ::profile::maps::postgresql_common + $db_name = $cleartables ? { + true => 'ct', + default => 'gis', + } class { '::postgresql::master': pgversion => '9.4', @@ -30,22 +36,22 @@ postgresql::user { 'kartotherian': user => 'kartotherian', password => $kartotherian_pass, - database => 'gis', + database => $db_name, } postgresql::user { 'tileratorui': user => 'tileratorui', password => $tileratorui_pass, - database => 'gis', + database => $db_name, } postgresql::user { 'osmimporter': user => 'osmimporter', password => $osmimporter_pass, - database => 'gis', + database => $db_name, } postgresql::user { 'osmupdater': user => 'osmupdater', password => $osmupdater_pass, - database => 'gis', + database => $db_name, } profile::maps::tilerator_user { 'localhost': @@ -62,7 +68,7 @@ } # Grants - file { '/usr/local/bin/maps-grants-gis.sql': + file { "/usr/local/bin/maps-grants-${db_name}.sql": owner => 'root', group => 'root', mode => '0400', @@ -76,8 +82,19 @@ } # DB setup - postgresql::spatialdb { 'gis': } - + postgresql::spatialdb { $db_name: } + if $cleartables { + postgresql::db::extension { "${title}-fuzzystrmatch": + database => $db_name, + extname => 'fuzzystrmatch', + require => Postgresql::Db[$db_name], + } + postgresql::db::extension { "${title}-unaccent": + database => $db_name, + extname => 'unaccent', + require => Postgresql::Db[$db_name], + } + } # some additional logging for the postgres master to help diagnose import # performance issues @@ -111,16 +128,26 @@ ], } - osm::planet_sync { 'gis': - ensure => present, - flat_nodes => true, - expire_levels => '15', - num_threads => 4, - pg_password => $osmupdater_pass, - period => $planet_sync_period, - hour => $planet_sync_hour, - minute => $planet_sync_minute, - postreplicate_command => 'sudo -u tileratorui /usr/local/bin/notify-tilerator', + if $cleartables { + osm::cleartables_sync { $db_name: + ensure => present, + pg_password => $osmupdater_pass, + hour => $planet_sync_hour, + minute => $planet_sync_minute, + postreplicate_command => 'sudo -u tileratorui /usr/local/bin/notify-tilerator', + } + } else { + osm::planet_sync { $db_name: + ensure => present, + flat_nodes => true, + expire_levels => '15', + num_threads => 4, + pg_password => $osmupdater_pass, + period => $planet_sync_period, + hour => $planet_sync_hour, + minute => $planet_sync_minute, + postreplicate_command => 'sudo -u tileratorui /usr/local/bin/notify-tilerator', + } } class { 'osm::prometheus': diff --git a/modules/profile/templates/maps/grants-gis.sql.erb b/modules/profile/templates/maps/grants-gis.sql.erb index b776b4c..8e1352b 100644 --- a/modules/profile/templates/maps/grants-gis.sql.erb +++ b/modules/profile/templates/maps/grants-gis.sql.erb @@ -9,7 +9,7 @@ GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO osmimporter; ALTER DEFAULT PRIVILEGES GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO osmimporter; -GRANT CREATE, TEMPORARY ON DATABASE gis TO osmimporter; +GRANT CREATE, TEMPORARY ON DATABASE <%= @db_name %> TO osmimporter; GRANT USAGE, SELECT, UPDATE ON ALL SEQUENCES IN SCHEMA public to osmimporter; GRANT USAGE, SELECT, UPDATE ON ALL SEQUENCES IN SCHEMA public to osmupdater; diff --git a/modules/role/manifests/maps/test/vectortiles_master.pp b/modules/role/manifests/maps/test/vectortiles_master.pp new file mode 100644 index 0000000..f84f269 --- /dev/null +++ b/modules/role/manifests/maps/test/vectortiles_master.pp @@ -0,0 +1,17 @@ +# Sets up a maps server master +class role::maps::test::vectortiles_master { + include ::standard + include ::base::firewall + + include ::profile::maps::apps + include ::profile::maps::cassandra + include ::profile::maps::osm_master + include ::profile::redis::master + + system::role { 'role::maps::test::master': + ensure => 'present', + description => 'Maps master with vector tiles (postgresql, cassandra, redis, tilerator, kartotherian)', + } + +} + -- To view, visit https://gerrit.wikimedia.org/r/378245 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ida723dee967cffb4ed44b46b14f7257f0a146ffd Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Gehel <guillaume.leder...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits