Divec has uploaded a new change for review.
https://gerrit.wikimedia.org/r/119720
Change subject: Fix entity escaping
......................................................................
Fix entity escaping
Parsoid returns data-* entities that contain HTML tags. Previously we
were only escaping quot, not amp/gt/lt/singlequote, so we needed a
workaround to avoid causing corruption in every article. Other entities
in a wiki article can also contain angle brackets.
Change-Id: Ie3972b9195fb30cd4197c77356d40c45485e051d
---
M server/mt/providers/Rot13.js
M server/segmentation/languages/CXParser.js
2 files changed, 15 insertions(+), 14 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/ContentTranslation refs/changes/20/119720/1
diff --git a/server/mt/providers/Rot13.js b/server/mt/providers/Rot13.js
index e84998e..1e2c624 100644
--- a/server/mt/providers/Rot13.js
+++ b/server/mt/providers/Rot13.js
@@ -51,17 +51,21 @@
var parser = this.parser;
parser.parsedText = '';
/**
- * Entity handler
+ * HTML entity escaper
*/
- function entity( str ) {
- return str.replace( '"', '&quot;' );
+ function escapeEntity( str ) {
+ return str.replace(
+ /["'&<>]/g,
+ function ( ch ) { return '&#' + ch.charCodeAt(0) + ';' }
+ )
 }
parser.onopentag = function ( tag ) {
var attrName;
parser.parsedText += '<' + tag.name;
for ( attrName in tag.attributes ) {
- parser.parsedText += ' ' + attrName + '="' + entity(
tag.attributes[ attrName ] ) + '"';
+ parser.parsedText += ' ' + attrName + '="' +
+ escapeEntity( tag.attributes[ attrName ] ) +
'"';
}
parser.parsedText += '>';
};
diff --git a/server/segmentation/languages/CXParser.js b/server/segmentation/languages/CXParser.js
index 429d559..0d9db97 100644
--- a/server/segmentation/languages/CXParser.js
+++ b/server/segmentation/languages/CXParser.js
@@ -50,10 +50,13 @@
};
/**
- * Entity handler
+ * HTML entity escaper
*/
-function entity( str ) {
- return str.replace( '"', '&quot;' );
+function escapeEntity( str ) {
+ return str.replace(
+ /["'&<>]/g,
+ function ( ch ) { return '&#' + ch.charCodeAt(0) + ';' }
+ )
}
/**
@@ -146,13 +149,7 @@
}
attributes = tag.attributes;
for ( attrName in attributes ) {
- if ( attrName === 'data-parsoid' || attrName === 'data-mw' ) {
- // Parsoid gives the html with these attributes and has
- // values as big escaped htmls. The parser has trouble
in
- // not leaking it to the text. So ignore these
attributes.
- continue;
- }
- this.print( ' ' + attrName + '="' + entity(
attributes[attrName] ) + '"' );
+ this.print( ' ' + attrName + '="' + escapeEntity(
attributes[attrName] ) + '"' );
}
// Sections
--
To view, visit https://gerrit.wikimedia.org/r/119720
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie3972b9195fb30cd4197c77356d40c45485e051d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/ContentTranslation
Gerrit-Branch: master
Gerrit-Owner: Divec <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits