Subramanya Sastry has uploaded a new change for review. https://gerrit.wikimedia.org/r/225328
Change subject: Scripts for generating the title database for rt-testing ...................................................................... Scripts for generating the title database for rt-testing * Bare minimum to get this done. Requires some manual work. But, could be tweaked further next time this is required. But, not worth the time now. Bug: T101928 Change-Id: I07728fb757f7e7d7f85c3865576ec816c76aa3cb --- A tests/server/scripts/README A tests/server/scripts/download.sh A tests/server/scripts/fetch_rc.js A tests/server/scripts/gen_titles.sh A tests/server/scripts/importAll.sh A tests/server/scripts/jsonify.js 6 files changed, 198 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/28/225328/1 diff --git a/tests/server/scripts/README b/tests/server/scripts/README new file mode 100644 index 0000000..5fa37f3 --- /dev/null +++ b/tests/server/scripts/README @@ -0,0 +1,20 @@ +1. Prepare the scripts + Update download.sh, gen_titles.sh, fetch_rc.js, importAll.sh + with wikis you want to download from and also set + the # titles and fractions for randomly picked titles + and recently edited pages. + +2. Download latest title dumps by running download.sh + +3. Fetch recently edited titles by running fetch_rc.js + +4. Run gen_titles.sh to: + * generate random selection of titles + * merge the two sets (random selection, recently edited) of + titles and generate a single unique title list. + * jsonify the list for use with the importJson.js script + +5. Upload/move the json files to the right server/directory + and run the importAll.sh to populate your rt-testing db. + +You are all done! 
diff --git a/tests/server/scripts/download.sh b/tests/server/scripts/download.sh new file mode 100755 index 0000000..92fd61a --- /dev/null +++ b/tests/server/scripts/download.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +#---- for wikis --- +LANG="en de nl fr it ru es sv pl ja ar he hi ko zh" +# link prefix languages +LANG=$LANG" ckb cu cv hy is kaa ka lbe ln mzn pnb uk uz" + +for l in $LANG ; do + wget http://dumps.wikimedia.org/${l}wiki/latest/${l}wiki-latest-all-titles-in-ns0.gz +done + +#---- for wiktionaries --- +LANG="en fr" +for l in $LANG ; do + wget http://dumps.wikimedia.org/${l}wiktionary/latest/${l}wiktionary-latest-all-titles-in-ns0.gz +done diff --git a/tests/server/scripts/fetch_rc.js b/tests/server/scripts/fetch_rc.js new file mode 100755 index 0000000..710f318 --- /dev/null +++ b/tests/server/scripts/fetch_rc.js @@ -0,0 +1,115 @@ +'use strict'; +require('../../lib/core-upgrade.js'); +var fs = require('fs'); +var request = require('request'); + +var wikis = [ + { prefix: 'enwiki', limit: 30 }, + { prefix: 'dewiki', limit: 10 }, + { prefix: 'nlwiki', limit: 10 }, + { prefix: 'frwiki', limit: 10 }, + { prefix: 'itwiki', limit: 10 }, + { prefix: 'ruwiki', limit: 10 }, + { prefix: 'eswiki', limit: 10 }, + { prefix: 'svwiki', limit: 8 }, + { prefix: 'plwiki', limit: 8 }, + { prefix: 'jawiki', limit: 8 }, + { prefix: 'arwiki', limit: 7 }, + { prefix: 'hewiki', limit: 7 }, + { prefix: 'hiwiki', limit: 7 }, + { prefix: 'kowiki', limit: 7 }, + { prefix: 'zhwiki', limit: 5 }, + { prefix: 'ckbwiki', limit: 1 }, + { prefix: 'cuwiki', limit: 1 }, + { prefix: 'cvwiki', limit: 1 }, + { prefix: 'hywiki', limit: 1 }, + { prefix: 'iswiki', limit: 1 }, + { prefix: 'kaawiki', limit: 1 }, + { prefix: 'kawiki', limit: 1 }, + { prefix: 'lbewiki', limit: 1 }, + { prefix: 'lnwiki', limit: 1 }, + { prefix: 'mznwiki', limit: 1 }, + { prefix: 'pnbwiki', limit: 1 }, + { prefix: 'ukwiki', limit: 1 }, + { prefix: 'uzwiki', limit: 1 }, + { prefix: 'enwiktionary', limit: 1 }, + { prefix: 
'frwiktionary', limit: 1 }, +]; + +var processRes, fetchAll; + +processRes = function(fetchArgs, out, err, resp, body) { + if (err || resp.statusCode !== 200) { + if (err) { + console.error('Error: ' + err); + } + if (resp) { + console.error('Status code: ' + resp.statusCode); + } + return; + } + + // Accum titles + body = JSON.parse(body); + Array.prototype.reduce.call(body.query.recentchanges, + function(titles, e) { + titles.push(e.title); + return titles; + }, + out); + + // More to fetch? + var resContinue = body['continue']; + if (resContinue && fetchArgs.count > 0) { + fetchArgs.opts['continue'] = resContinue['continue']; + fetchArgs.opts.rccontinue = resContinue.rccontinue; + fetchAll(fetchArgs, out); + } else { + var fileName = './' + fetchArgs.prefix + '.rc_titles.txt'; + console.warn('Got ' + out.length + ' titles from ' + fetchArgs.prefix + '; writing to ' + fileName); + fs.writeFileSync(fileName, out.join('\n')); + } +}; + +fetchAll = function(fetchArgs, out) { + var n = fetchArgs.count; + var opts = fetchArgs.opts; + opts.rclimit = n < 500 ? 
n : 500; + var requestOpts = { + method: 'GET', + followRedirect: true, + uri: fetchArgs.uri, + qs: opts + }; + fetchArgs.count -= opts.rclimit; + + // console.log('Fetching ' + opts.rclimit + ' results from ' + fetchArgs.prefix); + request(requestOpts, processRes.bind(null, fetchArgs, out)); +}; + +var FRACTION = 0.31; +wikis.forEach(function(obj) { + var prefix = obj.prefix; + var count = obj.limit*1000*FRACTION; + var domain = prefix.replace(/wiki/, '.wikipedia.org').replace(/wiktionary/, '.wiktionary.org'); + var opts = { + action: 'query', + list: 'recentchanges', + format: 'json', + rcnamespace: '0', + rcprop: 'title', + rcshow: '!bot', + rctoponly: true, + 'continue': '', + }; + + console.log('Processing: ' + prefix); + var fetchArgs = { + prefix: prefix, + count: count, + uri: 'http://' + domain + '/w/api.php', + opts: opts + }; + fetchAll(fetchArgs, []); +}); + diff --git a/tests/server/scripts/gen_titles.sh b/tests/server/scripts/gen_titles.sh new file mode 100755 index 0000000..c5c4d33 --- /dev/null +++ b/tests/server/scripts/gen_titles.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +#---- wikis ---- +LANG="enwiki dewiki nlwiki frwiki itwiki ruwiki eswiki svwiki plwiki jawiki arwiki hewiki hiwiki kowiki zhwiki" +HOWMANY=(30 10 10 10 10 10 10 8 8 8 7 7 7 7 5) +# link prefix languages +LANG=$LANG" ckbwiki cuwiki cvwiki hywiki iswiki kaawiki kawiki lbewiki lnwiki mznwiki pnbwiki ukwiki uzwiki" +HOWMANY=("${HOWMANY[@]}" 1 1 1 1 1 1 1 1 1 1 1 1 1) + +#---- wiktionaries ---- +LANG=$LANG" enwiktionary frwiktionary" +HOWMANY=("${HOWMANY[@]}" 1 1) + +i=0 +FRACTION=700; +for l in $LANG ; do + n=${HOWMANY[$i]} + suffix=".random_titles.txt" + echo $l, $n + zcat ${l}-latest-all-titles-in-ns0.gz | sort -R | head -$[$n*FRACTION] > ${l}${suffix} + head -2 ${l}${suffix} + cat ${l}${suffix} ${l}.rc_titles.txt | sort | uniq | head -$[$n*1000+100] | tail -$[$n*1000] > ${l}.all_titles.txt + $(dirname $0)/jsonify.js ${l}.all_titles.txt > ${l}.json + i=`expr $i + 1` +done diff --git 
a/tests/server/scripts/importAll.sh b/tests/server/scripts/importAll.sh new file mode 100755 index 0000000..36159e3 --- /dev/null +++ b/tests/server/scripts/importAll.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +#---- wikis ---- +LANG="enwiki dewiki nlwiki frwiki itwiki ruwiki eswiki svwiki plwiki jawiki arwiki hewiki hiwiki kowiki zhwiki" +# link prefix languages +LANG=$LANG" ckbwiki cuwiki cvwiki hywiki iswiki kaawiki kawiki lbewiki lnwiki mznwiki pnbwiki ukwiki uzwiki" +#---- wikis ---- +LANG=$LANG" enwiktionary frwiktionary" + +for l in $LANG; do + echo ${l} + $(dirname $0)/../importJson.js -D testreduce_0715 -u testreduce --prefix ${l} ${l}.json +done diff --git a/tests/server/scripts/jsonify.js b/tests/server/scripts/jsonify.js new file mode 100755 index 0000000..bc62396 --- /dev/null +++ b/tests/server/scripts/jsonify.js @@ -0,0 +1,9 @@ +#!/usr/bin/env node +var fs = require( 'fs' ); + +var filename = process.argv[2]; + +var titles = fs.readFileSync(filename, 'utf8').split(/[\n\r]+/); +console.assert(titles.pop() === ''); // trailing newline. + +console.log(JSON.stringify(titles, null, '\t')); -- To view, visit https://gerrit.wikimedia.org/r/225328 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I07728fb757f7e7d7f85c3865576ec816c76aa3cb Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: Subramanya Sastry <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
