ArielGlenn has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/373354 )
Change subject: Add RDF dumps for categories
......................................................................
Add RDF dumps for categories
Creates an RDF dump for each wiki listed in the categories-rdf dblist,
under other/categoriesrdf/YYYYMMDD.
Old dumps are kept for 70 days.
The dumps are kept in TTL format, e.g.: testwiki.ttl.gz.
Bug: T173892
Change-Id: Idc3710f13d2ab03006011850bec98ee168e247c5
---
M modules/dataset/manifests/dirs.pp
A modules/snapshot/files/cron/dumpcategoriesrdf.sh
A modules/snapshot/files/cron/logrotate.categoriesrdf
M modules/snapshot/manifests/cron.pp
A modules/snapshot/manifests/cron/categoriesrdf.pp
5 files changed, 191 insertions(+), 0 deletions(-)
Approvals:
ArielGlenn: Looks good to me, approved
jenkins-bot: Verified
diff --git a/modules/dataset/manifests/dirs.pp
b/modules/dataset/manifests/dirs.pp
index 4cdfcb2..3a4831c 100644
--- a/modules/dataset/manifests/dirs.pp
+++ b/modules/dataset/manifests/dirs.pp
@@ -16,6 +16,7 @@
$medialistsdir = "${otherdir}/imageinfo"
$pagetitlesdir = "${otherdir}/pagetitles"
$mediatitlesdir = "${otherdir}/mediatitles"
+ $categoriesrdf = "${otherdir}/categoriesrdf"
file { $datadir:
ensure => 'directory',
@@ -129,4 +130,11 @@
owner => 'datasets',
group => 'datasets',
}
+
+ file { $categoriesrdf:
+ ensure => 'directory',
+ mode => '0755',
+ owner => 'datasets',
+ group => 'datasets',
+ }
}
diff --git a/modules/snapshot/files/cron/dumpcategoriesrdf.sh
b/modules/snapshot/files/cron/dumpcategoriesrdf.sh
new file mode 100755
index 0000000..3153aa9
--- /dev/null
+++ b/modules/snapshot/files/cron/dumpcategoriesrdf.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+#############################################################
+# This file is maintained by puppet!
+# modules/snapshot/cron/dumpcategoriesrdf.sh
+#############################################################
+#
+# Generate an RDF dump of categories for all wikis in
+# categories-rdf list and remove old ones.
+
+source /usr/local/etc/set_dump_dirs.sh
+
+usage() {
+ echo "Usage: $0 --list wikis.dblist [--config <pathtofile>] [--dryrun]"
+ echo
+ echo " --config path to configuration file for dump generation"
+ echo " (default value: ${confsdir}/wikidump.conf"
+ echo " --list file containing list of the wikis to dump"
+ echo " --dryrun don't run dump, show what would have been done"
+ exit 1
+}
+
+configFile="${confsdir}/wikidump.conf"
+dryrun="false"
+dumpFormat="ttl"
+dbList="categories-rdf"
+
+while [ $# -gt 0 ]; do
+ if [ $1 == "--config" ]; then
+ configFile="$2"
+ shift; shift;
+ elif [ $1 == "--dryrun" ]; then
+ dryrun="true"
+ shift
+ elif [ $1 == "--list" ]; then
+ dbList="$2"
+ shift; shift;
+ else
+ echo "$0: Unknown option $1"
+ usage
+ fi
+done
+
+if [ -z "$dbList" -o ! -f "$dbList" ]; then
+ echo "Valid wiki list must be specified"
+ echo "Exiting..."
+ exit 1
+fi
+
+if [ ! -f "$configFile" ]; then
+ echo "Could not find config file: $configFile"
+ echo "Exiting..."
+ exit 1
+fi
+
+deployDir=$(egrep "^dir=" "$configFile" | mawk -Fdir= '{ print $2 }')
+gzip=$(egrep "^gzip=" "$configFile" | mawk -Fgzip= '{ print $2 }')
+privateList=$(egrep "^privatelist=" "$configFile" | mawk -Fprivatelist= '{
print $2 }')
+publicDir=$(egrep "^public=" "$configFile" | mawk -Fpublic= '{ print $2 }')
+
+if [ -z "$deployDir" -o -z "$gzip" -o -z "$privateList" -o -z "$publicDir" ];
then
+ echo "failed to find value of one of the following from config file
$configFile:"
+ echo "gzip: $gzip"
+ echo "dir: $deployDir"
+ echo "privatelist: $privateList"
+ echo "public: $publicDir"
+ echo "exiting..."
+ exit 1
+fi
+
+today=$(date +'%Y%m%d')
+targetDirBase="${publicDir}/other/categoriesrdf"
+targetDir="${targetDirBase}/${today}"
+timestampsDir="${targetDirBase}/lastdump"
+multiVersionScript="${deployDir}/multiversion/MWScript.php"
+
+# remove old datasets
+daysToKeep=70
+cutOff=$(( $(date +%s) - $(( $daysToKeep + 1 )) * 24 * 3600))
+if [ -d "$targetDirBase" ]; then
+ for folder in $(ls -d -r "${targetDirBase}/"*); do
+ creationTime=$(date --utc --date="$(basename $folder)" +%s
2>/dev/null)
+ if [ -n "$creationTime" ]; then
+ if [ "$cutOff" -gt "$creationTime" ]; then
+ if [ "$dryrun" == "true" ]; then
+ echo rm "${folder}/"*".${dumpFormat}.gz"
+ echo rmdir "${folder}"
+ else
+ rm -f "${folder}/"*".${dumpFormat}.gz"
+ rmdir "${folder}"
+ fi
+ fi
+ fi
+ done
+fi
+
+# create todays folder
+if [ "$dryrun" == "true" ]; then
+ echo mkdir -p "$targetDir"
+ echo mkdir -p "$timestampsDir"
+else
+ if ! mkdir -p "$targetDir"; then
+ echo "Can't make output directory: $targetDir"
+ echo "Exiting..."
+ exit 1
+ fi
+ if ! mkdir -p "$timestampsDir"; then
+ echo "Can't make output directory: $timestampsDir"
+ echo "Exiting..."
+ exit 1
+ fi
+fi
+
+# iterate over configured wikis
+cat "$dbList" | while read wiki; do
+ # exclude all private wikis
+ if ! egrep -q "^${wiki}$" "$privateList"; then
+ filename="${wiki}-${today}-categories"
+ targetFile="${targetDir}/${filename}.${dumpFormat}.gz"
+ tsFile="${timestampsDir}/${wiki}-categories.last"
+ if [ "$dryrun" == "true" ]; then
+ echo "php $multiVersionScript
maintenance/dumpCategoriesAsRdf.php --wiki=$wiki --format=$dumpFormat 2>
/var/log/categoriesrdf/${filename}.log | $gzip > $targetFile"
+ else
+ php "$multiVersionScript"
maintenance/dumpCategoriesAsRdf.php --wiki="$wiki" --format="$dumpFormat" 2>
"/var/log/categoriesrdf/${filename}.log" | "$gzip" > "$targetFile"
+ echo "$today" > "$tsFile"
+ fi
+ fi
+done
+
+
+# Maintain a 'latest' symlink always pointing at the most recently completed
dump
+if [ "$dryrun" == "false" ]; then
+ cd "$targetDirBase"
+ ln -snf "$today" "latest"
+fi
diff --git a/modules/snapshot/files/cron/logrotate.categoriesrdf
b/modules/snapshot/files/cron/logrotate.categoriesrdf
new file mode 100644
index 0000000..af0cac5
--- /dev/null
+++ b/modules/snapshot/files/cron/logrotate.categoriesrdf
@@ -0,0 +1,11 @@
+# This file is managed by puppet
+# puppet:///modules/snapshot/cron/logrotate.categoriesrdf
+#
+/var/log/categoriesrdf/*.log {
+ daily
+ compress
+ delaycompress
+ missingok
+ maxage 22
+ nocreate
+}
diff --git a/modules/snapshot/manifests/cron.pp
b/modules/snapshot/manifests/cron.pp
index 9e8441e..c5a25a6 100644
--- a/modules/snapshot/manifests/cron.pp
+++ b/modules/snapshot/manifests/cron.pp
@@ -4,6 +4,7 @@
class { '::snapshot::cron::mediaperprojectlists': user => $user }
class { '::snapshot::cron::pagetitles': user => $user }
class { '::snapshot::cron::cirrussearch': user => $user }
+ class { '::snapshot::cron::categoriesrdf': user => $user }
class { '::snapshot::cron::dumplists': user => $user }
class { '::snapshot::cron::dump_global_blocks': user => $user }
class { '::snapshot::cron::wikidatadumps::json': user => $user }
diff --git a/modules/snapshot/manifests/cron/categoriesrdf.pp
b/modules/snapshot/manifests/cron/categoriesrdf.pp
new file mode 100644
index 0000000..8312683
--- /dev/null
+++ b/modules/snapshot/manifests/cron/categoriesrdf.pp
@@ -0,0 +1,37 @@
+class snapshot::cron::categoriesrdf(
+ $user = undef,
+) {
+ $confsdir = $snapshot::dumps::dirs::confsdir
+ $apachedir = $snapshot::dumps::dirs::apachedir
+
+ file { '/var/log/categoriesrdf':
+ ensure => 'directory',
+ mode => '0644',
+ owner => $user,
+ }
+
+ logrotate::conf { 'categoriesrdf':
+ ensure => present,
+ source => 'puppet:///modules/snapshot/cron/logrotate.categoriesrdf',
+ }
+
+ $scriptpath = '/usr/local/bin/dumpcategoriesrdf.sh'
+ file { $scriptpath:
+ mode => '0755',
+ owner => 'root',
+ group => 'root',
+ source => 'puppet:///modules/snapshot/cron/dumpcategoriesrdf.sh',
+ }
+
+ cron { 'categoriesrdf-dump':
+ ensure => 'present',
+ command => "${scriptpath} --config ${confsdir}/wikidump.conf
--list ${apachedir}/dblists/categories-rdf.dblist",
+ environment => '[email protected]',
+ user => $user,
+ minute => '0',
+ hour => '20',
+ weekday => '6',
+ require => File[$scriptpath],
+ }
+}
+
--
To view, visit https://gerrit.wikimedia.org/r/373354
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Idc3710f13d2ab03006011850bec98ee168e247c5
Gerrit-PatchSet: 11
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Smalyshev <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Hoo man <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits