[MediaWiki-commits] [Gerrit] operations/puppet[production]: Set a reasonable --batch-size for Wikidata entity dumps

2017-09-27 Thread ArielGlenn (Code Review)
ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/380628 )

Change subject: Set a reasonable --batch-size for Wikidata entity dumps
......................................................................


Set a reasonable --batch-size for Wikidata entity dumps

I benchmarked this a little; it should make the dumps about
8-10% faster by always asking PrefetchingWikiPageEntityMetaDataAccessor
for about 500 entities at a time.

Change-Id: I10f12b200cd47ff27898c3f3dacc79610c649eba
---
M modules/snapshot/files/cron/dumpwikidatajson.sh
M modules/snapshot/files/cron/dumpwikidatardf.sh
2 files changed, 4 insertions(+), 4 deletions(-)
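
For context on the arithmetic behind the new flag: each shard processes
every $shards-th entity, so asking the accessor for $shards * 500
consecutive entities leaves roughly 500 that the requesting shard
actually dumps. A minimal standalone sketch, using an assumed shard
count of 6 (the real value is set elsewhere in these scripts):

    # Illustration only; shards=6 is a hypothetical value.
    shards=6
    batchSize=`expr $shards \* 500`
    echo $batchSize   # prints 3000; ~3000/6 = 500 entities per shard and batch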

Approvals:
  Hoo man: Looks good to me, but someone else must approve
  ArielGlenn: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/modules/snapshot/files/cron/dumpwikidatajson.sh b/modules/snapshot/files/cron/dumpwikidatajson.sh
index 80dd30b..5bb3279 100644
--- a/modules/snapshot/files/cron/dumpwikidatajson.sh
+++ b/modules/snapshot/files/cron/dumpwikidatajson.sh
@@ -29,7 +29,7 @@
(
set -o pipefail

errorLog=/var/log/wikidatadump/dumpwikidatajson-$filename-$i.log
-   php5 $multiversionscript extensions/Wikidata/extensions/Wikibase/repo/maintenance/dumpJson.php --wiki wikidatawiki --shard $i --sharding-factor $shards --snippet 2>> $errorLog | gzip -9 > $tempDir/wikidataJson.$i.gz
+   php5 $multiversionscript extensions/Wikidata/extensions/Wikibase/repo/maintenance/dumpJson.php --wiki wikidatawiki --shard $i --sharding-factor $shards --batch-size `expr $shards \* 500` --snippet 2>> $errorLog | gzip -9 > $tempDir/wikidataJson.$i.gz
exitCode=$?
if [ $exitCode -gt 0 ]; then
echo -e "\n\n(`date --iso-8601=minutes`) Process for shard $i failed with exit code $exitCode" >> $errorLog
@@ -75,7 +75,7 @@
exit 1
fi
fileSize=`stat --printf="%s" $tempFile`
-   if [ $fileSize -lt `expr 105 / $shards` ]; then
+   if [ $fileSize -lt `expr 200 / $shards` ]; then
echo "File size of $tempFile is only $fileSize. Aborting." >> 
$mainLogFile
exit 1
fi
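
A side note on the size guard above: expr performs truncating integer
division, so the per-shard minimum shrinks stepwise as the shard count
grows. A small illustration with assumed values (shards=6, a 40-byte
result):

    # Hypothetical values; mirrors the guard in the diff above.
    shards=6
    fileSize=40
    expr 200 / $shards                              # prints 33
    if [ $fileSize -lt `expr 200 / $shards` ]; then
        echo "too small"                            # not reached: 40 >= 33
    fi
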
diff --git a/modules/snapshot/files/cron/dumpwikidatardf.sh b/modules/snapshot/files/cron/dumpwikidatardf.sh
index 036eec9..1742842 100755
--- a/modules/snapshot/files/cron/dumpwikidatardf.sh
+++ b/modules/snapshot/files/cron/dumpwikidatardf.sh
@@ -43,7 +43,7 @@
 
 declare -A dumpNameToMinSize
 # Sanity check: Minimal size we expect each shard of a certain dump to have
-dumpNameToMinSize=(["all"]=`expr 125 / $shards` ["truthy"]=`expr 75 / $shards`)
+dumpNameToMinSize=(["all"]=`expr 235 / $shards` ["truthy"]=`expr 140 / $shards`)
 
 # Try to create the dump (up to three times).
 retries=0
@@ -56,7 +56,7 @@
(
set -o pipefail

errorLog=/var/log/wikidatadump/dumpwikidata$dumpFormat-$filename-$i.log
-   php5 $multiversionscript extensions/Wikidata/extensions/Wikibase/repo/maintenance/dumpRdf.php --wiki wikidatawiki --shard $i --sharding-factor $shards --format $dumpFormat --flavor $dumpFlavor 2>> $errorLog | gzip -9 > $tempDir/wikidata$dumpFormat-$dumpName.$i.gz
+   php5 $multiversionscript extensions/Wikidata/extensions/Wikibase/repo/maintenance/dumpRdf.php --wiki wikidatawiki --shard $i --sharding-factor $shards --batch-size `expr $shards \* 500` --format $dumpFormat --flavor $dumpFlavor 2>> $errorLog | gzip -9 > $tempDir/wikidata$dumpFormat-$dumpName.$i.gz
exitCode=$?
if [ $exitCode -gt 0 ]; then
echo -e "\n\n(`date --iso-8601=minutes`) Process for shard $i failed with exit code $exitCode" >> $errorLog
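
For concreteness, this is what a single shard's JSON invocation looks
like after the patch, with hypothetical values substituted (shard 0 of
6, so --batch-size resolves to 3000); $multiversionscript, $errorLog and
$tempDir are defined by the surrounding script:

    # Example values only; flags and paths are copied from the diff above.
    php5 $multiversionscript extensions/Wikidata/extensions/Wikibase/repo/maintenance/dumpJson.php \
        --wiki wikidatawiki --shard 0 --sharding-factor 6 \
        --batch-size 3000 --snippet 2>> $errorLog | gzip -9 > $tempDir/wikidataJson.0.gz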

-- 
To view, visit https://gerrit.wikimedia.org/r/380628
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I10f12b200cd47ff27898c3f3dacc79610c649eba
Gerrit-PatchSet: 2
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Hoo man 
Gerrit-Reviewer: ArielGlenn 
Gerrit-Reviewer: Daniel Kinzler 
Gerrit-Reviewer: Hoo man 
Gerrit-Reviewer: Ladsgroup 
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] operations/puppet[production]: Set a reasonable --batch-size for Wikidata entity dumps

2017-09-25 Thread Hoo man (Code Review)
Hoo man has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/380628 )

Change subject: Set a reasonable --batch-size for Wikidata entity dumps
......................................................................



  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/28/380628/1
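
To inspect the patch locally without merging it into the current
branch, the usual Gerrit alternative to the pull command above is to
fetch the same change ref and check it out detached:

    git fetch ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/28/380628/1
    git checkout FETCH_HEAD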


-- 
To view, visit https://gerrit.wikimedia.org/r/380628
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I10f12b200cd47ff27898c3f3dacc79610c649eba
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Hoo man 
