Divec has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/119720

Change subject: Fix entity escaping
......................................................................

Fix entity escaping

Parsoid returns data-* attributes whose values contain HTML tags. Previously
we were only escaping quot, not amp/gt/lt/single quote, so we needed a
workaround to avoid causing corruption in every article. Other attribute
values in a wiki article can also contain angle brackets.

Change-Id: Ie3972b9195fb30cd4197c77356d40c45485e051d
---
M server/mt/providers/Rot13.js
M server/segmentation/languages/CXParser.js
2 files changed, 15 insertions(+), 14 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/ContentTranslation refs/changes/20/119720/1

diff --git a/server/mt/providers/Rot13.js b/server/mt/providers/Rot13.js
index e84998e..1e2c624 100644
--- a/server/mt/providers/Rot13.js
+++ b/server/mt/providers/Rot13.js
@@ -51,17 +51,21 @@
        var parser = this.parser;
        parser.parsedText = '';
        /**
-        * Entity handler
+        * HTML entity escaper
         */
-       function entity( str ) {
-               return str.replace( '"', '"' );
+       function escapeEntity( str ) {
+               return str.replace(
+                       /["'&<>]/g,
+                       function ( ch ) { return '&#' + ch.charCodeAt(0) + ';' }
+               )
        }
 
        parser.onopentag = function ( tag ) {
                var attrName;
                parser.parsedText += '<' + tag.name;
                for ( attrName in tag.attributes ) {
-                       parser.parsedText += ' ' + attrName + '="' + entity( tag.attributes[ attrName ] ) + '"';
+                       parser.parsedText += ' ' + attrName + '="' +
+                               escapeEntity( tag.attributes[ attrName ] ) + '"';
                }
                parser.parsedText += '>';
        };
diff --git a/server/segmentation/languages/CXParser.js b/server/segmentation/languages/CXParser.js
index 429d559..0d9db97 100644
--- a/server/segmentation/languages/CXParser.js
+++ b/server/segmentation/languages/CXParser.js
@@ -50,10 +50,13 @@
 };
 
 /**
- * Entity handler
+ * HTML entity escaper
  */
-function entity( str ) {
-       return str.replace( '"', '&quot;' );
+function escapeEntity( str ) {
+       return str.replace(
+               /["'&<>]/g,
+               function ( ch ) { return '&#' + ch.charCodeAt(0) + ';' }
+       )
 }
 
 /**
@@ -146,13 +149,7 @@
        }
        attributes = tag.attributes;
        for ( attrName in attributes ) {
-               if ( attrName === 'data-parsoid' || attrName === 'data-mw' ) {
-                       // Parsoid gives the html with these attributes and has
-                       // values as big escaped htmls. The parser has trouble in
-                       // not leaking it to the text. So ignore these attributes.
-                       continue;
-               }
-               this.print( ' ' + attrName + '="' + entity( attributes[attrName] ) + '"' );
+               this.print( ' ' + attrName + '="' + escapeEntity( attributes[attrName] ) + '"' );
        }
 
        // Sections

-- 
To view, visit https://gerrit.wikimedia.org/r/119720
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie3972b9195fb30cd4197c77356d40c45485e051d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/ContentTranslation
Gerrit-Branch: master
Gerrit-Owner: Divec <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to