Cscott has uploaded a new change for review. https://gerrit.wikimedia.org/r/49498
Change subject: Tweak normalization regexps to allow > in attribute values.
......................................................................

Tweak normalization regexps to allow > in attribute values.

HTML serialization allows > in attribute values; see:
http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#serializing-html-fragments

And in fact we tend to get "<nowiki>" and other similar things in
attribute values. Update the regular expressions used in normalization
to more accurately match HTML elements which may contain quoted > in
attributes.

Change-Id: I5ec8c3b79618205b7575e102a48fcd16f7479fad
---
M js/lib/mediawiki.Util.js
1 file changed, 10 insertions(+), 4 deletions(-)

git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid refs/changes/98/49498/1

diff --git a/js/lib/mediawiki.Util.js b/js/lib/mediawiki.Util.js
index d42b628..bbdc8ca 100644
--- a/js/lib/mediawiki.Util.js
+++ b/js/lib/mediawiki.Util.js
@@ -869,16 +869,22 @@
  */
 normalizeOut = function ( out ) {
 	// TODO: Do not strip newlines in pre and nowiki blocks!
+	// NOTE that we use a slightly restricted regexp for "attribute"
+	// which works for the output of DOM serialization. For example,
+	// we know that attribute values will be surrounded with double quotes,
+	// not unquoted or quoted with single quotes. The serialization
+	// algorithm is given by:
+	// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#serializing-html-fragments
 	out = normalizeNewlines( out );
 	return out
-		.replace(/<span typeof="mw:(?:(?:Placeholder|Nowiki|Object\/Template|Entity))"[^>]*>((?:[^<]+|(?!<\/span).)*)<\/span>/g, '$1')
+		.replace(/<span typeof="mw:(?:(?:Placeholder|Nowiki|Object\/Template|Entity))"(?:\s+[^\s"'>\/=]+(?:\s*=\s*"[^"]*")?)*\s*>((?:[^<]+|(?!<\/span).)*)<\/span>/g, '$1')
 		// Ignore these attributes for now
-		.replace(/ (data-parsoid|typeof|resource|rel|prefix|about|rev|datatype|inlist|property|vocab|content|title|class)="[^">]*"/g, '')
+		.replace(/ (data-parsoid|typeof|resource|rel|prefix|about|rev|datatype|inlist|property|vocab|content|title|class)="[^"]*"/g, '')
 		// replace mwt ids
 		.replace(/\s*id="mwt\d+"/, '')
 		//.replace(/<!--.*?-->\n?/gm, '')
-		.replace(/<\/?(?:meta|link)(?: [^>]*)?>/g, '')
-		.replace(/<span[^>]+about="[^]+>/g, '')
+		.replace(/<\/?(?:meta|link)(?:\s+[^\s"'>\/=]+(?:\s*=\s*"[^"]*")?)*\s*\/?>/g, '')
+		.replace(/<span[^>]+about="[^"]*"[^>]*>/g, '')
 		.replace(/<span><\/span>/g, '')
 		.replace(/href="(?:\.?\.\/)+/g, 'href="')
 		// strip thumbnail size prefixes

-- 
To view, visit https://gerrit.wikimedia.org/r/49498
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I5ec8c3b79618205b7575e102a48fcd16f7479fad
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Cscott <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
