Gehel has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/399954 )
Change subject: Add loading DCAT-AP data into dcatap namespace on WDQS ...................................................................... Add loading DCAT-AP data into dcatap namespace on WDQS Bug: T178978 Change-Id: Ie1d7eb7567d1409fb1116a71b23b4d5160aa2c7b --- A modules/wdqs/files/cron/cronUtils.sh M modules/wdqs/files/cron/reloadCategories.sh A modules/wdqs/files/cron/reloadDCAT-AP.sh M modules/wdqs/manifests/gui.pp 4 files changed, 118 insertions(+), 31 deletions(-) Approvals: jenkins-bot: Verified Gehel: Looks good to me, approved diff --git a/modules/wdqs/files/cron/cronUtils.sh b/modules/wdqs/files/cron/cronUtils.sh new file mode 100755 index 0000000..4d2487a --- /dev/null +++ b/modules/wdqs/files/cron/cronUtils.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Cron helper functions +if [ -r /etc/wdqs/vars.sh ]; then + . /etc/wdqs/vars.sh +fi + +if [ -r /etc/wdqs/gui_vars.sh ]; then + . /etc/wdqs/gui_vars.sh +fi + +if [ -z "${DATA_DIR}" -o -z "${LOG_DIR}" -o -z "${DEPLOY_DIR}" ]; then + echo "Variables not set up right!" + exit 1 +fi + +HOST="http://localhost:9999" +NAMESPACE_URL="/bigdata/namespace/" +today=$(date -u +'%Y%m%d') + +function loadFileIntoBlazegraph { + # source URL + local URL=$1 + # local filename (will be in DATA_DIR) + local fileName=$2 + local sparqlEndpoint=$3 + curl -s -f -XGET $URL -o ${DATA_DIR}/${fileName} + if [ ! -s ${DATA_DIR}/${fileName} ]; then + echo "Could not download $URL into ${fileName}" + exit 1 + fi + curl -s -XPOST --data-binary update="LOAD <file://$DATA_DIR/$FILENAME>" $sparqlEndpoint +} + +# NOTE: This should be run under user that has rights to +# sudo systemctl reload nginx +function replaceNamespace { + local mainName=$1 + local currentAlias=$2 + local oldNamespace=$(cat $ALIAS_FILE | grep $mainName | cut -d' ' -f2 | cut -d ';' -f1) + if [ "${oldNamespace}" = ${currentAlias} ]; then + # nothing to do + return + fi + if [ -n "${oldNamespace}" ]; then + sed -i "/${mainName}/c ${mainName} ${currentAlias};" $ALIAS_FILE + else + echo "${mainName} ${currentAlias};" >> $ALIAS_FILE + fi + # Bump nginx to reload config + sudo systemctl reload nginx + if [ -n "${oldNamespace}" ]; then + # Drop old namespace + curl -s -X DELETE "${HOST}${NAMESPACE_URL}${oldNamespace}" + fi +} \ No newline at end of file diff --git a/modules/wdqs/files/cron/reloadCategories.sh b/modules/wdqs/files/cron/reloadCategories.sh index 33b83e9..2710a19 100755 --- a/modules/wdqs/files/cron/reloadCategories.sh +++ b/modules/wdqs/files/cron/reloadCategories.sh @@ -2,35 +2,14 @@ # This script is reloading categories into a new namespace # NOTE: This should be run under user that has rights to # sudo systemctl reload nginx -if [ -r /etc/wdqs/vars.sh ]; then - . /etc/wdqs/vars.sh -fi +. /usr/local/bin/cronUtils.sh -if [ -r /etc/wdqs/gui_vars.sh ]; then - . /etc/wdqs/gui_vars.sh -fi - -if [ -z "${DATA_DIR}" -o -z "${LOG_DIR}" -o -z "${DEPLOY_DIR}" ]; then - echo "Variables not set up right!" - exit 1 -fi - -today=$(date -u +'%Y%m%d') newNamespace="categories${today}" # Drop old dumps rm -f ${DATA_DIR}/*-categories.ttl.gz cd $DEPLOY_DIR +# Create new namespace bash createNamespace.sh $newNamespace || exit 1 # Load the data bash forAllCategoryWikis.sh loadCategoryDump.sh $newNamespace >> "${LOG_DIR}/${newNamespace}.log" -# Get old namespace -oldNamespace=$(cat $ALIAS_FILE | grep categories | cut -d' ' -f2 | cut -d ';' -f1) -# Switch the map -# NOTE: right now it overrides the map. If we reuse it for other purposes, this needs to be made smarter. -echo "categories ${newNamespace};" > $ALIAS_FILE -# Bump nginx to reload config -sudo systemctl reload nginx -if [ -n "${oldNamespace}" ]; then - # Drop old namespace - curl -s -X DELETE "http://localhost:9999/bigdata/namespace/${oldNamespace}" -fi +replaceNamespace categories $newNamespace diff --git a/modules/wdqs/files/cron/reloadDCAT-AP.sh b/modules/wdqs/files/cron/reloadDCAT-AP.sh new file mode 100755 index 0000000..618bddc --- /dev/null +++ b/modules/wdqs/files/cron/reloadDCAT-AP.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# This script is reloading DCAT-AP data from Wikidata +# NOTE: This should be run under user that has rights to +# sudo systemctl reload nginx +. /usr/local/bin/cronUtils.sh + +DCAT_SOURCE=${DCAT_SOURCE:-"https://dumps.wikimedia.org/wikidatawiki/entities/dcatap.rdf"} + +newNamespace="dcatap${today}" +# Drop old dumps +rm -f ${DATA_DIR}/dcatap-*.rdf +cd $DEPLOY_DIR +# Create new NS +bash createNamespace.sh $newNamespace || exit 1 +# Load the data +FILENAME=dcatap-${today}.rdf +loadFileIntoBlazegraph $DCAT_SOURCE $FILENAME "${HOST}${NAMESPACE_URL}${newNamespace}/sparql" +replaceNamespace dcatap $newNamespace diff --git a/modules/wdqs/manifests/gui.pp b/modules/wdqs/manifests/gui.pp index 458e38e..d056705 100644 --- a/modules/wdqs/manifests/gui.pp +++ b/modules/wdqs/manifests/gui.pp @@ -32,7 +32,7 @@ } # List of namespace aliases in format: - # ALIAS REAL_NAME + # ALIAS REAL_NAME; # This map is generated manually or by category update script file { $alias_map: ensure => present, @@ -59,13 +59,28 @@ mode => '0644', } + file { '/usr/local/bin/cronUtils.sh': + ensure => present, + source => 'puppet:///modules/wdqs/cron/cronUtils.sh', + owner => 'root', + group => 'root', + mode => '0755', + } + file { '/usr/local/bin/reloadCategories.sh': - ensure => present, - source => 'puppet:///modules/wdqs/cron/reloadCategories.sh', - owner => 'root', - group => 'root', - mode => '0755', - require => File['/etc/wdqs/gui_vars.sh'], + ensure => present, + source => 'puppet:///modules/wdqs/cron/reloadCategories.sh', + owner => 'root', + group => 'root', + mode => '0755', + } + + file { '/usr/local/bin/reloadDCAT-AP.sh': + ensure => present, + source => 'puppet:///modules/wdqs/cron/reloadDCAT-AP.sh', + owner => 'root', + group => 'root', + mode => '0755', } $cron_log = "${log_dir}/reloadCategories.log" @@ -88,6 +103,15 @@ hour => fqdn_rand(2), } + cron { 'reload-dcatap': + ensure => present, + command => "/usr/local/bin/reloadDCAT-AP.sh >> ${log_dir}/dcat.log", + user => $username, + weekday => 4, + minute => 0, + hour => 10, + } + logrotate::rule { 'wdqs-reload-categories': ensure => present, file_glob => $cron_log, @@ -99,6 +123,17 @@ su => "${username} wikidev", } + logrotate::rule { 'wdqs-reload-dcat': + ensure => present, + file_glob => "${log_dir}/dcatap.log", + frequency => 'monthly', + missing_ok => true, + not_if_empty => true, + rotate => 3, + compress => true, + su => "${username} wikidev", + } + # Remove categories*.log files after 30 days. logrotate::rule { 'wdqs-categories-logs': ensure => present, -- To view, visit https://gerrit.wikimedia.org/r/399954 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ie1d7eb7567d1409fb1116a71b23b4d5160aa2c7b Gerrit-PatchSet: 7 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: Gehel <guillaume.leder...@wikimedia.org> Gerrit-Reviewer: Lokal Profil <lokal.pro...@gmail.com> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits