jenkins-bot has submitted this change and it was merged. Change subject: Remove dumpGrepper files ......................................................................
Remove dumpGrepper files * Can now be installed with npm i -g dumpgrepper * Leaving it out of devDependencies because libxml fails to compile on jenkins and there's no optionalDevDependencies in npm, yet. Change-Id: If21dfcf0575b15776e388e5220d1b6cb811be2f6 --- M tests/README.md D tests/dumpGrepPatterns/martian-endtags.sh D tests/dumpGrepPatterns/misc.txt D tests/dumpGrepper.js D tests/dumpReader.js 5 files changed, 13 insertions(+), 297 deletions(-) Approvals: Cscott: Looks good to me, approved jenkins-bot: Verified diff --git a/tests/README.md b/tests/README.md index 931f959..2be3662 100644 --- a/tests/README.md +++ b/tests/README.md @@ -70,3 +70,16 @@ $ node client Then take a look at [the statistics](http://localhost:8001/). + +== Running the dumpgrepper == + +The dumpgrepper utility is useful to search XML dumps for specific regexp +patterns. With a simple regexp, an enwiki dump can be grepped in ~20 minutes. + +The grepper operates on actual wikitext (with XML encoding removed), so there is +no need to complicate regexps with entities. It supports JavaScript RegExps. + + $ npm install -g dumpgrepper + +More information on [github](https://github.com/wikimedia/dumpgrepper) and the +[mediawiki wiki](https://www.mediawiki.org/wiki/Parsoid/DumpGrepper). 
diff --git a/tests/dumpGrepPatterns/martian-endtags.sh b/tests/dumpGrepPatterns/martian-endtags.sh deleted file mode 100755 index b0395cf..0000000 --- a/tests/dumpGrepPatterns/martian-endtags.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh - -# extension tag hooks enabled at en.wikipedia.org -exts="categorytree|charinsert|gallery|hiero|imagemap|inputbox|math|nowiki|poem|pre|ref|references|source|syntaxhighlight|timeline" - -wiki="nowiki|includeonly|noinclude|onlyinclude" - -# just the html5 elements -html5s="a|abbr|address|area|article|aside|audio|b|base|bdi|bdo|blockquote|body|br|button|canvas|caption|cite|code|col|colgroup|command|data|datalist|dd|del|details|dfn|div|dl|dt|em|embed|fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|head|header|hgroup|hr|html|i|iframe|img|input|ins|kbd|keygen|label|legend|li|link|map|mark|menu|meta|meter|nav|noscript|object|ol|optgroup|option|output|p|param|pre|progress|q|rp|rt|rtc|ruby|s|samp|script|section|select|small|source|span|strong|style|sub|summary|sup|table|tbody|td|textarea|tfoot|th|thead|time|title|tr|track|u|ul|var|video|wbr" - -htmlold="center|font|tt" - -normaltags="$exts|$wiki|$html5s|$htmlold" - -#regexp="<(?!\/|$exts|$htmls)[^>]*>.*?<!--([^<]+|<(\/|$exts|$htmls)[^>]*>)*<\/(?!$exts|$htmls)[^>]*>" -#regexp="<(?!/|$normaltags)[^&]+>[^&]+<!--[^&-]*</(?!$normaltags)((?!>).)+>" -regexp="</(?=[a-z])(?!$normaltags)[^>]+>" -#regexp="<(?!\/|$exts|$htmls)[^>]*>" - -#echo $regexp - -if [ -z "$1" ];then - echo "Usage: $0 <xmldump.gz>" - exit 1 -fi - -zcat $1 | node ../dumpGrepper.js -i "$regexp" diff --git a/tests/dumpGrepPatterns/misc.txt b/tests/dumpGrepPatterns/misc.txt deleted file mode 100644 index cbbcc7f..0000000 --- a/tests/dumpGrepPatterns/misc.txt +++ /dev/null @@ -1,18 +0,0 @@ -# A collection of misc interesting regexps - -# ISBN links with at least one line break (https://bugzilla.wikimedia.org/show_bug.cgi?id=29025) -(?:(?:RFC|PMID)[ \t\n\r\f]*[\n\f\r]+[ \t\n\r\f]*([0-9]+)|ISBN[ \t\n\r\f]*[\n\f\r]+[ 
\t\n\r\f]*(\b(?:97[89][ -]?)?(?:[0-9][ -]?){9}[0-9Xx]\b)) - -# ISBN links with at least two line breaks (https://bugzilla.wikimedia.org/show_bug.cgi?id=29025) -(?:(?:RFC|PMID)[ \t\n\r\f]*(?:[\n\f\r][ \t\n\r\f]*){2,}([0-9]+)|ISBN[ \t\n\r\f]*(?:[\n\f\r][ \t\n\r\f]*){2,}(\b(?:97[89][ -]?)?(?:[0-9][ -]?){9}[0-9Xx]\b)) - -# Template:Table_cell_templates in enwiki -{{\s*(?:rh|rh2|yes|Ya|no|Na|coming soon|bad|eliminated|Site active|Site inactive|good|yes2|won|no2|nom|sho|TBA|partial|yes-No|okay|some|any|n/a|BLACK|dunno|Unknown|Depends|Included|dropped|terminated|beta|table-experimental|free|nonfree|proprietary|needs|incorrect|no result|pending|nightly|release-candidate|[?]|unofficial|usually|rarely|sometimes|draw)\s*(?:[|]|}}) - -# cases which aren't the simple '| {{yes}}' case. -[^ \t|]\s*{{\s*(?:rh|rh2|yes|Ya|no|Na|coming soon|bad|eliminated|Site active|Site inactive|good|yes2|won|no2|nom|sho|TBA|partial|yes-No|okay|some|any|n/a|BLACK|dunno|Unknown|Depends|Included|dropped|terminated|beta|table-experimental|free|nonfree|proprietary|needs|incorrect|no result|pending|nightly|release-candidate|[?]|unofficial|usually|rarely|sometimes|draw)\s*(?:[|]|}}) - -# blank lines with more than one comment (bug 41756) -^([ ]*<!--((?!-->).)*-->){2,}[ ]*$ (use with -m option) -# more precise version, avoid those surrounded by newlines -[^\n]\n([ ]*<!--((?!-->).)*-->){2,}[ ]*\n(?!\n) diff --git a/tests/dumpGrepper.js b/tests/dumpGrepper.js deleted file mode 100755 index 546087b..0000000 --- a/tests/dumpGrepper.js +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env node -/** - * A simple dump grepper based on the DumpReader module. 
- */ -"use strict"; -require( '../lib/core-upgrade.js' ); - -var dumpReader = require('./dumpReader.js'), - events = require('events'), - util = require('util'), - yargs = require('yargs'), - Util = require( '../lib/mediawiki.Util.js' ).Util; - -function DumpGrepper ( regexp ) { - // inherit from EventEmitter - events.EventEmitter.call(this); - this.re = regexp; -} - -util.inherits(DumpGrepper, events.EventEmitter); - -DumpGrepper.prototype.grepRev = function ( revision, onlyFirst ) { - var result = this.re.exec( revision.text ), - matches = []; - while ( result ) { - matches.push( result ); - if ( onlyFirst ) { break; } - result = this.re.exec( revision.text ); - } - if ( matches.length ) { - this.emit( 'match', revision, matches ); - } -}; - -module.exports.DumpGrepper = DumpGrepper; - -if (module === require.main) { - var opts = yargs.usage( 'Usage: zcat dump.xml.gz | $0 <regexp>', { - 'i': { - description: 'Case-insensitive matching', - 'boolean': true, - 'default': false - }, - 'm': { - description: 'Treat ^ and $ as matching beginning/end of *each* line, instead of beginning/end of entire article', - 'boolean': true, - 'default': false - }, - 'color': { - description: 'Highlight matched substring using color. Use --no-color to disable. 
Default is "auto".', - 'default': 'auto' - }, - 'l': { - description: 'Suppress normal output; instead print the name of each article from which output would normally have been printed.', - 'boolean': true, - 'default': false - } - } ); - var argv = opts.argv; - - if( argv.help ) { - opts.showHelp(); - process.exit( 0 ); - } - Util.setColorFlags( argv ); - - var flags = 'g'; - if( Util.booleanOption( argv.i ) ) { - flags += 'i'; - } - if( Util.booleanOption( argv.m ) ) { - flags += 'm'; - } - - var re = new RegExp( argv._[0], flags ); - var onlyFirst = Util.booleanOption( argv.l ); - - var reader = new dumpReader.DumpReader(), - grepper = new DumpGrepper( re ), - stats = { - revisions: 0, - matches: 0 - }; - - reader.on( 'revision', function ( revision ) { - stats.revisions++; - grepper.grepRev( revision, onlyFirst ); - } ); - - grepper.on( 'match', function ( revision, matches ) { - stats.matches++; - if ( Util.booleanOption( argv.l ) ) { - console.log( revision.page.title ); - return; - } - for ( var i = 0, l = matches.length; i < l; i++ ) { - console.log( '== Match: [[' + revision.page.title + ']] ==' ); - var m = matches[i]; - //console.warn( JSON.stringify( m.index, null, 2 ) ); - console.log( - revision.text.substr( m.index - 40, 40 ) + - m[0].green + - revision.text.substr( m.index + m[0].length, 40 ) ); - } - } ); - - process.stdin.on ( 'end' , function() { - // Print some stats - console.warn( '################################################' ); - console.warn( 'Total revisions: ' + stats.revisions ); - console.warn( 'Total matches: ' + stats.matches ); - console.warn( 'Ratio: ' + (stats.matches / stats.revisions * 100) + '%' ); - console.warn( '################################################' ); - } ); - - process.stdin.on('data', reader.push.bind(reader) ); - process.stdin.setEncoding('utf8'); - process.stdin.resume(); - - -} - diff --git a/tests/dumpReader.js b/tests/dumpReader.js deleted file mode 100644 index 75185f7..0000000 --- 
a/tests/dumpReader.js +++ /dev/null @@ -1,130 +0,0 @@ -"use strict"; - -var events = require('events'), - util = require('util'), - libxml = require('libxmljs'); // npm install libxmljs - -function DumpReader() { - events.EventEmitter.call(this); - this.makeParser(); -} - -util.inherits(DumpReader, events.EventEmitter); - -/** - * @param {Stream} stream input stream to read XML from - */ -DumpReader.prototype.makeParser = function() { - - var self = this, - stack = [{}], - workspace = {}, - buffer = ''; - - function flip(arr) { - var obj = {}; - arr.forEach(function(val) { - obj[val] = true; - }); - return obj; - } - var textNodes = flip(['id', 'text', 'title', 'minor', 'comment', 'username', 'timestamp']), - boolNodes = flip(['minor', 'redirect']), - ignoreNodes = flip(['mediawiki', 'siteinfo', 'upload', 'thread'] ), - parser = new libxml.SaxPushParser(); - this.parser = parser; - parser.on('startElementNS', function(elem, attrs, prefix, uri, namespaces) { - //console.warn( 'elem: ' + elem ); - if (elem in ignoreNodes) { - /* jshint noempty: false */ // we know this is empty! - // ... - } else if (elem === 'page') { - //console.warn( 'starting page' ); - stack = []; - workspace = {}; - } else if (elem === 'revision') { - stack.push(workspace); - workspace = { - page: workspace - }; - } else if (elem in textNodes || elem in boolNodes) { - buffer = ''; - } else { - stack.push(workspace); - workspace = {}; - } - }); - - parser.on( 'endElementNS', function(elem, prefix, uri) { - // ping something! 
- if (elem === 'mediawiki') { - self.complete = true; - //stream.pause(); - self.emit('end', {}); - } else if (elem === 'page') { - self.emit('page', workspace); - workspace = stack.pop(); - } else if (elem === 'revision') { - self.emit('revision', workspace); - workspace = stack.pop(); - } else if (elem in textNodes) { - workspace[elem] = buffer; - } else if (elem in boolNodes) { - workspace[elem] = true; - } else { - var current = workspace; - workspace = stack.pop(); - workspace[elem] = current; - } - }); - - parser.on( 'characters', function(chars) { - buffer += chars; - }); - parser.on( 'cdata', function(cdata) { - buffer += cdata; - }); - parser.on( 'endDocument', function() { - // This doesn't seem to run...? - self.complete = true; - //stream.pause(); - self.emit('end', {}); - }); - parser.on( 'error', function(err) { - self.emit('error', err); - // Should we.... stop reading now or what? - }); - -}; - -DumpReader.prototype.push = function( chunk ) { - //console.log( 'dr read' + chunk ); - this.parser.push( chunk ); -}; - - -module.exports.DumpReader = DumpReader; - -if (module === require.main) { - var reader = new DumpReader(); - reader.on('end', function() { - console.log('done!'); - process.exit(); - }); - reader.on('error', function(err) { - console.log('error!', err); - process.exit(1); - }); - reader.on('page', function(page) { - console.log('page', page); - }); - reader.on('revision', function(revision) { - revision.text = revision.text.substr(0, 40); - console.log('revision', revision); - }); - console.log('Reading!'); - process.stdin.setEncoding('utf8'); - - process.stdin.on('data', reader.push.bind(reader) ); - process.stdin.resume(); -} -- To view, visit https://gerrit.wikimedia.org/r/180642 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: If21dfcf0575b15776e388e5220d1b6cb811be2f6 Gerrit-PatchSet: 4 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: Arlolra 
<abrea...@wikimedia.org> Gerrit-Reviewer: Arlolra <abrea...@wikimedia.org> Gerrit-Reviewer: Cscott <canan...@wikimedia.org> Gerrit-Reviewer: GWicke <gwi...@wikimedia.org> Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits