ArielGlenn has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/373354 )

Change subject: Add RDF dumps for categories
......................................................................


Add RDF dumps for categories

Creates an RDF dump for each wiki configured in the categories-rdf dblist
in other/categoriesrdf/YYYYMMDD.
Keeps old dumps for 70 days.
The dumps are kept in gzipped TTL format, e.g.: testwiki-YYYYMMDD-categories.ttl.gz.

Bug: T173892
Change-Id: Idc3710f13d2ab03006011850bec98ee168e247c5
---
M modules/dataset/manifests/dirs.pp
A modules/snapshot/files/cron/dumpcategoriesrdf.sh
A modules/snapshot/files/cron/logrotate.categoriesrdf
M modules/snapshot/manifests/cron.pp
A modules/snapshot/manifests/cron/categoriesrdf.pp
5 files changed, 191 insertions(+), 0 deletions(-)

Approvals:
  ArielGlenn: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/modules/dataset/manifests/dirs.pp 
b/modules/dataset/manifests/dirs.pp
index 4cdfcb2..3a4831c 100644
--- a/modules/dataset/manifests/dirs.pp
+++ b/modules/dataset/manifests/dirs.pp
@@ -16,6 +16,7 @@
     $medialistsdir            = "${otherdir}/imageinfo"
     $pagetitlesdir            = "${otherdir}/pagetitles"
     $mediatitlesdir           = "${otherdir}/mediatitles"
+    $categoriesrdf            = "${otherdir}/categoriesrdf"
 
     file { $datadir:
         ensure => 'directory',
@@ -129,4 +130,11 @@
         owner  => 'datasets',
         group  => 'datasets',
     }
+
+    file { $categoriesrdf:
+        ensure => 'directory',
+        mode   => '0755',
+        owner  => 'datasets',
+        group  => 'datasets',
+    }
 }
diff --git a/modules/snapshot/files/cron/dumpcategoriesrdf.sh 
b/modules/snapshot/files/cron/dumpcategoriesrdf.sh
new file mode 100755
index 0000000..3153aa9
--- /dev/null
+++ b/modules/snapshot/files/cron/dumpcategoriesrdf.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+#############################################################
+# This file is maintained by puppet!
+# modules/snapshot/cron/dumpcategoriesrdf.sh
+#############################################################
+#
+# Generate an RDF dump of categories for all wikis in
+# categories-rdf list and remove old ones.
+
+source /usr/local/etc/set_dump_dirs.sh
+
+usage() {  # print the command-line help to stdout, then exit with status 1
+       echo "Usage: $0 --list wikis.dblist [--config <pathtofile>] [--dryrun]"
+       echo
+       echo "  --config  path to configuration file for dump generation"
+       echo "            (default value: ${confsdir}/wikidump.conf)"
+       echo "  --list    file containing list of the wikis to dump"
+       echo "  --dryrun  don't run dump, show what would have been done"
+       exit 1
+}
+
+configFile="${confsdir}/wikidump.conf"  # overridden by --config
+dryrun="false"                          # "true" (via --dryrun) only prints the commands
+dumpFormat="ttl"                        # passed as --format and used in the output file extension
+dbList="categories-rdf"                 # overridden by --list
+
+while [ $# -gt 0 ]; do
+       if [ "$1" == "--config" ]; then  # "$1" is quoted so empty, multi-word or glob arguments compare safely
+               configFile="$2"
+               shift; shift;
+       elif [ "$1" == "--dryrun" ]; then
+               dryrun="true"
+               shift
+       elif [ "$1" == "--list" ]; then
+               dbList="$2"
+               shift; shift;
+       else
+               echo "$0: Unknown option $1"
+               usage
+       fi
+done
+
+if [ -z "$dbList" ] || [ ! -f "$dbList" ]; then  # separate tests: -o inside [ ] is obsolescent and ambiguous
+       echo "Valid wiki list must be specified"
+       echo "Exiting..."
+       exit 1
+fi
+
+if [ ! -f "$configFile" ]; then
+       echo "Could not find config file: $configFile"
+       echo "Exiting..."
+       exit 1
+fi
+
+deployDir=$(grep -E "^dir=" "$configFile" | mawk -Fdir= '{ print $2 }')  # grep -E: egrep warns on stderr in newer GNU grep
+gzip=$(grep -E "^gzip=" "$configFile" | mawk -Fgzip= '{ print $2 }')
+privateList=$(grep -E "^privatelist=" "$configFile" | mawk -Fprivatelist= '{ print $2 }')
+publicDir=$(grep -E "^public=" "$configFile" | mawk -Fpublic= '{ print $2 }')
+
+if [ -z "$deployDir" ] || [ -z "$gzip" ] || [ -z "$privateList" ] || [ -z "$publicDir" ]; then
+       echo "failed to find value of one of the following from config file $configFile:"
+       echo "gzip: $gzip"
+       echo "dir: $deployDir"
+       echo "privatelist: $privateList"
+       echo "public: $publicDir"
+       echo "exiting..."
+       exit 1
+fi
+
+today=$(date +'%Y%m%d')
+targetDirBase="${publicDir}/other/categoriesrdf"
+targetDir="${targetDirBase}/${today}"
+timestampsDir="${targetDirBase}/lastdump"
+multiVersionScript="${deployDir}/multiversion/MWScript.php"
+
+# remove old datasets
+daysToKeep=70
+cutOff=$(( $(date +%s) - $(( $daysToKeep + 1 )) * 24 * 3600))
+if [ -d "$targetDirBase" ]; then
+       for folder in $(ls -d -r "${targetDirBase}/"*); do
+               creationTime=$(date --utc --date="$(basename $folder)" +%s 
2>/dev/null)
+               if [ -n "$creationTime" ]; then
+                   if [ "$cutOff" -gt "$creationTime" ]; then
+                       if [ "$dryrun" == "true" ]; then
+                               echo rm "${folder}/"*".${dumpFormat}.gz"
+                               echo rmdir "${folder}"
+                       else
+                               rm -f "${folder}/"*".${dumpFormat}.gz"
+                               rmdir "${folder}"
+                       fi
+                   fi
+               fi
+       done
+fi
+
+# Create today's dump folder and the "lastdump" timestamps folder; in dry-run mode only print the mkdir commands.
+if [ "$dryrun" == "true" ]; then
+       echo mkdir -p "$targetDir"
+       echo mkdir -p "$timestampsDir"
+else
+       if ! mkdir -p "$targetDir"; then
+               echo "Can't make output directory: $targetDir"
+               echo "Exiting..."
+               exit 1
+       fi
+       if ! mkdir -p "$timestampsDir"; then
+               echo "Can't make output directory: $timestampsDir"
+               echo "Exiting..."
+               exit 1
+       fi
+fi
+
+# iterate over configured wikis
+cat "$dbList" | while read wiki; do
+       # exclude all private wikis
+       if ! egrep -q "^${wiki}$" "$privateList"; then
+               filename="${wiki}-${today}-categories"
+               targetFile="${targetDir}/${filename}.${dumpFormat}.gz"
+               tsFile="${timestampsDir}/${wiki}-categories.last"
+               if [ "$dryrun" == "true" ]; then
+                       echo "php $multiVersionScript 
maintenance/dumpCategoriesAsRdf.php --wiki=$wiki --format=$dumpFormat 2> 
/var/log/categoriesrdf/${filename}.log | $gzip > $targetFile"
+               else
+                       php "$multiVersionScript" 
maintenance/dumpCategoriesAsRdf.php --wiki="$wiki" --format="$dumpFormat" 2> 
"/var/log/categoriesrdf/${filename}.log" | "$gzip" > "$targetFile"
+                       echo "$today" > "$tsFile"
+               fi
+       fi
+done
+
+
+# Maintain a 'latest' symlink always pointing at the most recently completed dump
+if [ "$dryrun" == "false" ]; then
+       cd "$targetDirBase" || exit 1  # without this check a failed cd would put the link in the current directory
+       ln -snf "$today" "latest"
+fi
diff --git a/modules/snapshot/files/cron/logrotate.categoriesrdf 
b/modules/snapshot/files/cron/logrotate.categoriesrdf
new file mode 100644
index 0000000..af0cac5
--- /dev/null
+++ b/modules/snapshot/files/cron/logrotate.categoriesrdf
@@ -0,0 +1,11 @@
+# This file is managed by puppet
+# puppet:///modules/snapshot/cron/logrotate.categoriesrdf
+#
+/var/log/categoriesrdf/*.log {
+    daily
+    compress
+    delaycompress
+    missingok
+    maxage 22
+    nocreate
+}
diff --git a/modules/snapshot/manifests/cron.pp 
b/modules/snapshot/manifests/cron.pp
index 9e8441e..c5a25a6 100644
--- a/modules/snapshot/manifests/cron.pp
+++ b/modules/snapshot/manifests/cron.pp
@@ -4,6 +4,7 @@
     class { '::snapshot::cron::mediaperprojectlists': user => $user }
     class { '::snapshot::cron::pagetitles': user   => $user }
     class { '::snapshot::cron::cirrussearch': user   => $user }
+    class { '::snapshot::cron::categoriesrdf': user   => $user }
     class { '::snapshot::cron::dumplists': user   => $user }
     class { '::snapshot::cron::dump_global_blocks': user   => $user }
     class { '::snapshot::cron::wikidatadumps::json': user   => $user }
diff --git a/modules/snapshot/manifests/cron/categoriesrdf.pp 
b/modules/snapshot/manifests/cron/categoriesrdf.pp
new file mode 100644
index 0000000..8312683
--- /dev/null
+++ b/modules/snapshot/manifests/cron/categoriesrdf.pp
@@ -0,0 +1,37 @@
+class snapshot::cron::categoriesrdf(  # categories RDF dump: log directory, logrotate conf, dump script and cron entry
+    $user   = undef,  # account that owns the log directory and runs the cron job
+) {
+    $confsdir = $snapshot::dumps::dirs::confsdir
+    $apachedir =  $snapshot::dumps::dirs::apachedir
+
+    file { '/var/log/categoriesrdf':  # the dump script redirects each wiki's stderr to a log file in here
+        ensure => 'directory',
+        mode   => '0644',  # for directories Puppet sets the search bit wherever read is set
+        owner  => $user,
+    }
+
+    logrotate::conf { 'categoriesrdf':
+        ensure => present,
+        source => 'puppet:///modules/snapshot/cron/logrotate.categoriesrdf',
+    }
+
+    $scriptpath = '/usr/local/bin/dumpcategoriesrdf.sh'
+    file { $scriptpath:
+        mode   => '0755',
+        owner  => 'root',
+        group  => 'root',
+        source => 'puppet:///modules/snapshot/cron/dumpcategoriesrdf.sh',
+    }
+
+    cron { 'categoriesrdf-dump':
+        ensure      => 'present',
+        command     => "${scriptpath} --config ${confsdir}/wikidump.conf 
--list ${apachedir}/dblists/categories-rdf.dblist",
+        environment => 'MAILTO=ops-du...@wikimedia.org',
+        user        => $user,
+        minute      => '0',
+        hour        => '20',
+        weekday     => '6',  # Saturday, i.e. the job runs once a week
+        require     => File[$scriptpath],  # the script must be installed before the job is scheduled
+    }
+}
+

-- 
To view, visit https://gerrit.wikimedia.org/r/373354
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Idc3710f13d2ab03006011850bec98ee168e247c5
Gerrit-PatchSet: 11
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Smalyshev <smalys...@wikimedia.org>
Gerrit-Reviewer: ArielGlenn <ar...@wikimedia.org>
Gerrit-Reviewer: Gehel <guillaume.leder...@wikimedia.org>
Gerrit-Reviewer: Hoo man <h...@online.de>
Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to