ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/373354 )
Change subject: Add RDF dumps for categories ...................................................................... Add RDF dumps for categories Creates RDF dump for each wiki configured in categories-rdf dblist in other/categoriesrdf/YYYYMMDD. Keeps old dumps for 70 days. The dumps are kept in TTL format, e.g.: testwiki-YYYYMMDD-categories.ttl.gz. Bug: T173892 Change-Id: Idc3710f13d2ab03006011850bec98ee168e247c5 --- M modules/dataset/manifests/dirs.pp A modules/snapshot/files/cron/dumpcategoriesrdf.sh A modules/snapshot/files/cron/logrotate.categoriesrdf M modules/snapshot/manifests/cron.pp A modules/snapshot/manifests/cron/categoriesrdf.pp 5 files changed, 191 insertions(+), 0 deletions(-) Approvals: ArielGlenn: Looks good to me, approved jenkins-bot: Verified diff --git a/modules/dataset/manifests/dirs.pp b/modules/dataset/manifests/dirs.pp index 4cdfcb2..3a4831c 100644 --- a/modules/dataset/manifests/dirs.pp +++ b/modules/dataset/manifests/dirs.pp @@ -16,6 +16,7 @@ $medialistsdir = "${otherdir}/imageinfo" $pagetitlesdir = "${otherdir}/pagetitles" $mediatitlesdir = "${otherdir}/mediatitles" + $categoriesrdf = "${otherdir}/categoriesrdf" file { $datadir: ensure => 'directory', @@ -129,4 +130,11 @@ owner => 'datasets', group => 'datasets', } + + file { $categoriesrdf: + ensure => 'directory', + mode => '0755', + owner => 'datasets', + group => 'datasets', + } } diff --git a/modules/snapshot/files/cron/dumpcategoriesrdf.sh b/modules/snapshot/files/cron/dumpcategoriesrdf.sh new file mode 100755 index 0000000..3153aa9 --- /dev/null +++ b/modules/snapshot/files/cron/dumpcategoriesrdf.sh @@ -0,0 +1,134 @@ +#!/bin/bash +############################################################# +# This file is maintained by puppet! +# modules/snapshot/cron/dumpcategoriesrdf.sh +############################################################# +# +# Generate an RDF dump of categories for all wikis in +# categories-rdf list and remove old ones. 
+ +source /usr/local/etc/set_dump_dirs.sh + +usage() { + echo "Usage: $0 --list wikis.dblist [--config <pathtofile>] [--dryrun]" + echo + echo " --config path to configuration file for dump generation" + echo " (default value: ${confsdir}/wikidump.conf" + echo " --list file containing list of the wikis to dump" + echo " --dryrun don't run dump, show what would have been done" + exit 1 +} + +configFile="${confsdir}/wikidump.conf" +dryrun="false" +dumpFormat="ttl" +dbList="categories-rdf" + +while [ $# -gt 0 ]; do + if [ $1 == "--config" ]; then + configFile="$2" + shift; shift; + elif [ $1 == "--dryrun" ]; then + dryrun="true" + shift + elif [ $1 == "--list" ]; then + dbList="$2" + shift; shift; + else + echo "$0: Unknown option $1" + usage + fi +done + +if [ -z "$dbList" -o ! -f "$dbList" ]; then + echo "Valid wiki list must be specified" + echo "Exiting..." + exit 1 +fi + +if [ ! -f "$configFile" ]; then + echo "Could not find config file: $configFile" + echo "Exiting..." + exit 1 +fi + +deployDir=$(egrep "^dir=" "$configFile" | mawk -Fdir= '{ print $2 }') +gzip=$(egrep "^gzip=" "$configFile" | mawk -Fgzip= '{ print $2 }') +privateList=$(egrep "^privatelist=" "$configFile" | mawk -Fprivatelist= '{ print $2 }') +publicDir=$(egrep "^public=" "$configFile" | mawk -Fpublic= '{ print $2 }') + +if [ -z "$deployDir" -o -z "$gzip" -o -z "$privateList" -o -z "$publicDir" ]; then + echo "failed to find value of one of the following from config file $configFile:" + echo "gzip: $gzip" + echo "dir: $deployDir" + echo "privatelist: $privateList" + echo "public: $publicDir" + echo "exiting..." 
+ exit 1 +fi + +today=$(date +'%Y%m%d') +targetDirBase="${publicDir}/other/categoriesrdf" +targetDir="${targetDirBase}/${today}" +timestampsDir="${targetDirBase}/lastdump" +multiVersionScript="${deployDir}/multiversion/MWScript.php" + +# remove old datasets +daysToKeep=70 +cutOff=$(( $(date +%s) - $(( $daysToKeep + 1 )) * 24 * 3600)) +if [ -d "$targetDirBase" ]; then + for folder in $(ls -d -r "${targetDirBase}/"*); do + creationTime=$(date --utc --date="$(basename $folder)" +%s 2>/dev/null) + if [ -n "$creationTime" ]; then + if [ "$cutOff" -gt "$creationTime" ]; then + if [ "$dryrun" == "true" ]; then + echo rm "${folder}/"*".${dumpFormat}.gz" + echo rmdir "${folder}" + else + rm -f "${folder}/"*".${dumpFormat}.gz" + rmdir "${folder}" + fi + fi + fi + done +fi + +# create todays folder +if [ "$dryrun" == "true" ]; then + echo mkdir -p "$targetDir" + echo mkdir -p "$timestampsDir" +else + if ! mkdir -p "$targetDir"; then + echo "Can't make output directory: $targetDir" + echo "Exiting..." + exit 1 + fi + if ! mkdir -p "$timestampsDir"; then + echo "Can't make output directory: $timestampsDir" + echo "Exiting..." + exit 1 + fi +fi + +# iterate over configured wikis +cat "$dbList" | while read wiki; do + # exclude all private wikis + if ! 
egrep -q "^${wiki}$" "$privateList"; then + filename="${wiki}-${today}-categories" + targetFile="${targetDir}/${filename}.${dumpFormat}.gz" + tsFile="${timestampsDir}/${wiki}-categories.last" + if [ "$dryrun" == "true" ]; then + echo "php $multiVersionScript maintenance/dumpCategoriesAsRdf.php --wiki=$wiki --format=$dumpFormat 2> /var/log/categoriesrdf/${filename}.log | $gzip > $targetFile" + else + php "$multiVersionScript" maintenance/dumpCategoriesAsRdf.php --wiki="$wiki" --format="$dumpFormat" 2> "/var/log/categoriesrdf/${filename}.log" | "$gzip" > "$targetFile" + echo "$today" > "$tsFile" + fi + fi +done + + +# Maintain a 'latest' symlink always pointing at the most recently completed dump +if [ "$dryrun" == "false" ]; then + cd "$targetDirBase" + ln -snf "$today" "latest" +fi diff --git a/modules/snapshot/files/cron/logrotate.categoriesrdf b/modules/snapshot/files/cron/logrotate.categoriesrdf new file mode 100644 index 0000000..af0cac5 --- /dev/null +++ b/modules/snapshot/files/cron/logrotate.categoriesrdf @@ -0,0 +1,11 @@ +# This file is managed by puppet +# puppet:///modules/snapshot/cron/logrotate.categoriesrdf +# +/var/log/categoriesrdf/*.log { + daily + compress + delaycompress + missingok + maxage 22 + nocreate +} diff --git a/modules/snapshot/manifests/cron.pp b/modules/snapshot/manifests/cron.pp index 9e8441e..c5a25a6 100644 --- a/modules/snapshot/manifests/cron.pp +++ b/modules/snapshot/manifests/cron.pp @@ -4,6 +4,7 @@ class { '::snapshot::cron::mediaperprojectlists': user => $user } class { '::snapshot::cron::pagetitles': user => $user } class { '::snapshot::cron::cirrussearch': user => $user } + class { '::snapshot::cron::categoriesrdf': user => $user } class { '::snapshot::cron::dumplists': user => $user } class { '::snapshot::cron::dump_global_blocks': user => $user } class { '::snapshot::cron::wikidatadumps::json': user => $user } diff --git a/modules/snapshot/manifests/cron/categoriesrdf.pp b/modules/snapshot/manifests/cron/categoriesrdf.pp 
new file mode 100644 index 0000000..8312683 --- /dev/null +++ b/modules/snapshot/manifests/cron/categoriesrdf.pp @@ -0,0 +1,37 @@ +class snapshot::cron::categoriesrdf( + $user = undef, +) { + $confsdir = $snapshot::dumps::dirs::confsdir + $apachedir = $snapshot::dumps::dirs::apachedir + + file { '/var/log/categoriesrdf': + ensure => 'directory', + mode => '0644', + owner => $user, + } + + logrotate::conf { 'categoriesrdf': + ensure => present, + source => 'puppet:///modules/snapshot/cron/logrotate.categoriesrdf', + } + + $scriptpath = '/usr/local/bin/dumpcategoriesrdf.sh' + file { $scriptpath: + mode => '0755', + owner => 'root', + group => 'root', + source => 'puppet:///modules/snapshot/cron/dumpcategoriesrdf.sh', + } + + cron { 'categoriesrdf-dump': + ensure => 'present', + command => "${scriptpath} --config ${confsdir}/wikidump.conf --list ${apachedir}/dblists/categories-rdf.dblist", + environment => 'MAILTO=ops-du...@wikimedia.org', + user => $user, + minute => '0', + hour => '20', + weekday => '6', + require => File[$scriptpath], + } +} + -- To view, visit https://gerrit.wikimedia.org/r/373354 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Idc3710f13d2ab03006011850bec98ee168e247c5 Gerrit-PatchSet: 11 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: ArielGlenn <ar...@wikimedia.org> Gerrit-Reviewer: Gehel <guillaume.leder...@wikimedia.org> Gerrit-Reviewer: Hoo man <h...@online.de> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits