GWicke has submitted this change and it was merged.

Change subject: Tweak normalization regexps to allow > in attribute values.
......................................................................


Tweak normalization regexps to allow > in attribute values.

HTML serialization allows > in attribute values; see:
http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#serializing-html-fragments

And in fact we tend to get "<nowiki>" and other similar things in
attribute values.  Update the regular expressions used in
normalization to more accurately match HTML elements which may
contain quoted > in attributes.

Change-Id: I5ec8c3b79618205b7575e102a48fcd16f7479fad
---
M js/lib/mediawiki.Util.js
1 file changed, 10 insertions(+), 4 deletions(-)

Approvals:
  GWicke: Verified; Looks good to me, approved



diff --git a/js/lib/mediawiki.Util.js b/js/lib/mediawiki.Util.js
index d42b628..bbdc8ca 100644
--- a/js/lib/mediawiki.Util.js
+++ b/js/lib/mediawiki.Util.js
@@ -869,16 +869,22 @@
  */
 normalizeOut = function ( out ) {
        // TODO: Do not strip newlines in pre and nowiki blocks!
+       // NOTE that we use a slightly restricted regexp for "attribute"
+       //  which works for the output of DOM serialization.  For example,
+       //  we know that attribute values will be surrounded with double quotes,
+       //  not unquoted or quoted with single quotes.  The serialization
+       //  algorithm is given by:
+       //  http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#serializing-html-fragments
        out = normalizeNewlines( out );
        return out
-               .replace(/<span 
typeof="mw:(?:(?:Placeholder|Nowiki|Object\/Template|Entity))"[^>]*>((?:[^<]+|(?!<\/span).)*)<\/span>/g,
 '$1')
+               .replace(/<span 
typeof="mw:(?:(?:Placeholder|Nowiki|Object\/Template|Entity))"(?:\s+[^\s"'>\/=]+(?:\s*=\s*"[^"]*")?)*\s*>((?:[^<]+|(?!<\/span).)*)<\/span>/g,
 '$1')
                // Ignore these attributes for now
-               .replace(/ 
(data-parsoid|typeof|resource|rel|prefix|about|rev|datatype|inlist|property|vocab|content|title|class)="[^">]*"/g,
 '')
+               .replace(/ 
(data-parsoid|typeof|resource|rel|prefix|about|rev|datatype|inlist|property|vocab|content|title|class)="[^"]*"/g,
 '')
                // replace mwt ids
                .replace(/\s*id="mwt\d+"/, '')
                //.replace(/<!--.*?-->\n?/gm, '')
-               .replace(/<\/?(?:meta|link)(?: [^>]*)?>/g, '')
-               .replace(/<span[^>]+about="[^]+>/g, '')
+               
.replace(/<\/?(?:meta|link)(?:\s+[^\s"'>\/=]+(?:\s*=\s*"[^"]*")?)*\s*\/?>/g, '')
+               .replace(/<span[^>]+about="[^"]*"[^>]*>/g, '')
                .replace(/<span><\/span>/g, '')
                .replace(/href="(?:\.?\.\/)+/g, 'href="')
                // strip thumbnail size prefixes

-- 
To view, visit https://gerrit.wikimedia.org/r/49498
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I5ec8c3b79618205b7575e102a48fcd16f7479fad
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Cscott <[email protected]>
Gerrit-Reviewer: GWicke <[email protected]>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to