Cscott has uploaded a new change for review. https://gerrit.wikimedia.org/r/61780
Change subject: Let more attribs through in parsoid-only tests ...................................................................... Let more attribs through in parsoid-only tests We need e.g. property and class so we can make sure the new image tests actually test what we need 'em to. Change-Id: Idc0de45d43fdecfd37c730baee10c8ee086e6fd6 Co-authored-by: C. Scott Ananian <[email protected]> Co-authored-by: Mark Holmquist <[email protected]> --- M js/lib/mediawiki.Util.js M js/tests/parserTests.js 2 files changed, 59 insertions(+), 40 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid refs/changes/80/61780/1 diff --git a/js/lib/mediawiki.Util.js b/js/lib/mediawiki.Util.js index 82ab83b..fa365fb 100644 --- a/js/lib/mediawiki.Util.js +++ b/js/lib/mediawiki.Util.js @@ -989,18 +989,21 @@ // and eat all remaining newlines .replace(/[\r\n]/g, ''); -}, +}; /** * @method normalizeOut * * Specialized normalization of the wiki parser output, mostly to ignore a few - * known-ok differences. + * known-ok differences. If parsoidOnly is true-ish, then we allow more + * markup through (like property and typeof attributes), for better + * checking of parsoid-only test cases. * * @param {string} out + * @param {bool} parsoidOnly * @returns {string} */ -normalizeOut = function ( out ) { +var normalizeOut = function ( out, parsoidOnly ) { // TODO: Do not strip newlines in pre and nowiki blocks! // NOTE that we use a slightly restricted regexp for "attribute" // which works for the output of DOM serialization. For example, @@ -1012,23 +1015,29 @@ throw new Error("normalizeOut input is not in standard serialized form"); } out = normalizeNewlines( out ); - return out - .replace(/<span typeof="mw:(?:(?:Placeholder|Nowiki|Object\/Template|Entity))"(?: [^\0-\cZ\s\"\'>\/=]+(?:="[^"]*")?)*>((?:[^<]+|(?!<\/span).)*)<\/span>/g, '$1') - // Ignore these attributes for now - .replace(/ (data-mw|data-parsoid|typeof|resource|rel|prefix|about|rev|datatype|inlist|property|vocab|content|title|class)="[^"]*"/g, '') + if ( !parsoidOnly ) { + // remove <span typeof="....">....</span> + out = out. + replace(/<span typeof="mw:(?:(?:Placeholder|Nowiki|Object\/Template|Entity))"(?: [^\0-\cZ\s\"\'>\/=]+(?:="[^"]*")?)*>((?:[^<]+|(?!<\/span).)*)<\/span>/g, '$1'); + // ignore troublesome attributes + out = out.replace(/ (data-mw|data-parsoid|typeof|resource|rel|prefix|about|rev|datatype|inlist|property|vocab|content|title|class)="[^\"]*"/g, ''); + } else { + out = out.replace(/ (data-mw|data-parsoid|prefix|about|rev|datatype|inlist|vocab|content)="[^\"]*"/g, ''); + } + return out. // replace mwt ids - .replace(/ id="mwt\d+"/, '') + replace(/ id="mwt\d+"/, ''). //.replace(/<!--.*?-->\n?/gm, '') - .replace(/<\/?(?:meta|link)(?: [^\0-\cZ\s"'>\/=]+(?:="[^"]*")?)*\/?>/g, '') - .replace(/<span[^>]+about="[^"]*"[^>]*>/g, '') - .replace(/<span><\/span>/g, '') - .replace(/(href=")(?:\.?\.\/)+/g, '$1') + replace(/<\/?(?:meta|link)(?: [^\0-\cZ\s"'>\/=]+(?:="[^"]*")?)*\/?>/g, ''). + replace(/<span[^>]+about="[^"]*"[^>]*>/g, ''). + replace(/<span><\/span>/g, ''). + replace(/(href=")(?:\.?\.\/)+/g, '$1'). // replace unnecessary URL escaping - .replace(/ href="[^"]*"/g, decodeURIComponent) + replace(/ href="[^"]*"/g, decodeURIComponent). // strip thumbnail size prefixes - .replace(/(src="[^"]*?)\/thumb(\/[0-9a-f]\/[0-9a-f]{2}\/[^\/]+)\/[0-9]+px-[^"\/]+(?=")/g, '$1$2') - .replace(/(<(table|tbody|tr|th|td|\/th|\/td)[^<>]*>)\s+/g, '$1'); -}, + replace(/(src="[^"]*?)\/thumb(\/[0-9a-f]\/[0-9a-f]{2}\/[^\/]+)\/[0-9]+px-[^"\/]+(?=")/g, '$1$2'). + replace(/(<(table|tbody|tr|th|td|\/th|\/td)[^<>]*>)\s+/g, '$1'); +}; /** * @method normalizeHTML @@ -1037,44 +1046,52 @@ * re-serializing it to HTML. Ideally, the parser would normalize inter-tag * whitespace for us. For now, we fake that by simply stripping all newlines. * - * @param source {string} + * If parsoidOnly is true-ish, then we allow more markup through to allow + * more accurate parsoid-only tests. + * + * @param {string} source + * @param {bool} parsoidOnly * @return {string} */ -normalizeHTML = function ( source ) { +var normalizeHTML = function ( source, parsoidOnly ) { // TODO: Do not strip newlines in pre and nowiki blocks! source = normalizeNewlines( source ); try { var doc = this.parseHTML( source ); - return doc.body - .innerHTML + var out = doc.body.innerHTML. // a few things we ignore for now.. - //.replace(/\/wiki\/Main_Page/g, 'Main Page') + //replace(/\/wiki\/Main_Page/g, 'Main Page'). // do not expect a toc for now - .replace(/<table[^>]+?id="toc"[^>]*>.+?<\/table>/mg, '') + replace(/<table[^>]+?id="toc"[^>]*>.+?<\/table>/mg, ''). // do not expect section editing for now - .replace(/<span[^>]+class="mw-headline"[^>]*>(.*?)<\/span> *(<span class="mw-editsection">\[.*?<\/span>)?/g, '$1') + replace(/<span[^>]+class="mw-headline"[^>]*>(.*?)<\/span> *(<span class="mw-editsection">\[.*?<\/span>)?/g, '$1'). // remove empty span tags - .replace(/<span><\/span>/g, '') - // general class and titles, typically on links - .replace(/ (title|class|rel|about|typeof)="[^"]*"/g, '') - // strip red link markup, we do not check if a page exists yet - .replace(/\/index.php\?title=([^']+?)&action=edit&redlink=1/g, '/wiki/$1') + replace(/<span><\/span>/g, ''); + if ( !parsoidOnly ) { + out = out. + // general class and titles, typically on links + replace(/ (title|class|rel|about|typeof)="[^"]*"/g, ''). + // strip red link markup, we do not check if a page exists yet + replace(/\/index.php\?title=([^']+?)&action=edit&redlink=1/g, '/wiki/$1'); + } + out = out. // the expected html has some extra space in tags, strip it - .replace(/<a +href/g, '<a href') - .replace(/href="\/wiki\//g, 'href="') - .replace(/" +>/g, '">') + replace(/<a +href/g, '<a href'). + replace(/href="\/wiki\//g, 'href="'). + replace(/" +>/g, '">'). // replace unnecessary URL escaping - .replace(/ href="[^"]*"/g, decodeURIComponent) + replace(/ href="[^"]*"/g, decodeURIComponent). // strip empty spans - .replace(/<span><\/span>/g, '') - .replace(/(<(table|tbody|tr|th|td|\/th|\/td)[^<>]*>)\s+/g, '$1'); + replace(/<span><\/span>/g, ''). + replace(/(<(table|tbody|tr|th|td|\/th|\/td)[^<>]*>)\s+/g, '$1'); + return out; } catch(e) { - console.log("normalizeHTML failed on" + + console.log("normalizeHTML failed on" + source + " with the following error: " + e); console.trace(); return source; } -}, +}; /** * @method formatHTML @@ -1084,7 +1101,7 @@ * @param {string} source * @returns {string} */ -formatHTML = function ( source ) { +var formatHTML = function ( source ) { return source.replace( /(?!^)<((div|dd|dt|li|p|table|tr|td|tbody|dl|ol|ul|h1|h2|h3|h4|h5|h6)[^>]*)>/g, '\n<$1>'); }, diff --git a/js/tests/parserTests.js b/js/tests/parserTests.js index ef2381d..506ff53 100755 --- a/js/tests/parserTests.js +++ b/js/tests/parserTests.js @@ -1117,11 +1117,12 @@ */ ParserTests.prototype.checkHTML = function ( item, out, options, mode ) { var normalizedOut, normalizedExpected; + var parsoidOnly = (item.options.parsoid !== undefined); - normalizedOut = Util.normalizeOut( out ); + normalizedOut = Util.normalizeOut( out, parsoidOnly ); if ( item.cachedNormalizedHTML === null ) { - normalizedExpected = Util.normalizeHTML( item.result ); + normalizedExpected = Util.normalizeHTML( item.result, parsoidOnly ); item.cachedNormalizedHTML = normalizedExpected; } else { normalizedExpected = item.cachedNormalizedHTML; @@ -1730,6 +1731,7 @@ reportResultXML = function ( title, time, comments, iopts, expected, actual, options, mode, item ) { var timeTotal, testcaseEle; var quick = booleanOption( options.quick ); + var parsoidOnly = (iopts.parsoid !== undefined); if ( mode === 'selser' ) { title += ' ' + JSON.stringify( item.changes ); @@ -1742,7 +1744,7 @@ if ( fail && booleanOption( options.whitelist ) && title in testWhiteList && - Util.normalizeOut( testWhiteList[title] ) === actual.normal ) { + Util.normalizeOut( testWhiteList[title], parsoidOnly ) === actual.normal ) { whitelist = true; fail = false; } -- To view, visit https://gerrit.wikimedia.org/r/61780 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Idc0de45d43fdecfd37c730baee10c8ee086e6fd6 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Parsoid Gerrit-Branch: master Gerrit-Owner: Cscott <[email protected]> Gerrit-Reviewer: MarkTraceur <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
