C. Scott Ananian has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/316236

Change subject: WIP: fix more language converter bugs.
......................................................................

WIP: fix more language converter bugs.

Change-Id: Id6688d72175470148fb3295771f10cf2e802caa5
---
M lib/config/WikiConfig.js
M lib/html2wt/LanguageVariantHandler.js
M lib/mw/ApiRequest.js
M lib/wt2html/tt/LanguageVariantHandler.js
M tests/parserTests.txt
5 files changed, 150 insertions(+), 51 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/36/316236/1

diff --git a/lib/config/WikiConfig.js b/lib/config/WikiConfig.js
index 37a4f2a..30cac7e 100644
--- a/lib/config/WikiConfig.js
+++ b/lib/config/WikiConfig.js
@@ -514,6 +514,17 @@
        // Function hooks on this wiki, indexed by their normalized form
        this.functionHooks = new Set(resultConf.functionhooks || []);
 
+       // Somewhat annoyingly, LanguageConversion is turned on by default
+       // for all wikis (ie, $wgDisableLangConversion = false, as reported
+       // by siteinfo general.langconversion), but the
+       // -{ }- syntax is only parsed when the currently selected language
+       // has variants.  For the most part, the "language" is
+       // siteinfo general.lang and the variants are in siteinfo as lists in
+       // general.fallback and general.variants. *However* various mechanisms
+       // (like the Translate extension) can change the default language
+       // for a given page, and then we need to use other mechanisms to see
+       // if variants are active.
+
        // Match a wikitext line containing just whitespace, comments, and
        // sol transparent links and behavior switches.
        // Redirects should not contain any preceding non-whitespace chars.
diff --git a/lib/html2wt/LanguageVariantHandler.js b/lib/html2wt/LanguageVariantHandler.js
index cedbb3d..1f9cc0a 100644
--- a/lib/html2wt/LanguageVariantHandler.js
+++ b/lib/html2wt/LanguageVariantHandler.js
@@ -61,15 +61,19 @@
        };
 
        // Helper function: protect characters not allowed in -{ }- strings.
+       // XXX escape := in left side of unidir rules, and in language names (!)
        var protect = function(t) {
-               // XXX escape - characters that would form -{ }- markup.
-               // XXX escape confusing | : ; characters?
-               // XXX ensure that this wikitext won't break the -{}- context
-               return t;
+               return true ? t : t.replace(/[\{\}|:]/g, function(c) {
+                       // Escape {} characters that would form -{ }- markup
+                       // Escape | that would form a lang_variant_flag
+                       // Escape : that would form a lang_variant_option
+                       return '&#' + c.charCodeAt(0) + ';';
+               });
        };
        // Helper function: combine the three parts of the -{ }- string
        var combine = function(flagStr, bodyStr, useTrailingSemi) {
-               if (flagStr || /|/.test(bodyStr)) { flagStr = flagStr + '|'; }
+               console.log('combine', 
JSON.stringify({flagStr:flagStr,bodyStr:bodyStr,trailing:useTrailingSemi}));
+               if (flagStr || /\|/.test(bodyStr)) { flagStr = flagStr + '|'; }
                if (useTrailingSemi) { bodyStr = bodyStr + ';'; }
                return flagStr + bodyStr;
        };
diff --git a/lib/mw/ApiRequest.js b/lib/mw/ApiRequest.js
index 886f856..6edbf7b 100644
--- a/lib/mw/ApiRequest.js
+++ b/lib/mw/ApiRequest.js
@@ -76,6 +76,8 @@
 var latestSerial = 0;
 
 // all revision properties which parsoid is interested in.
+// XXX in theory I should ask for a pagelanguage here?
+// XXX need to add prop=info in order to get that!  see ApiQueryInfo
 var PARSOID_RVPROP = 
('content|ids|timestamp|user|userid|size|sha1|contentmodel|comment');
 
 var logAPIWarnings = function(req, data) {
diff --git a/lib/wt2html/tt/LanguageVariantHandler.js b/lib/wt2html/tt/LanguageVariantHandler.js
index c6c20ba..ca1b4df 100644
--- a/lib/wt2html/tt/LanguageVariantHandler.js
+++ b/lib/wt2html/tt/LanguageVariantHandler.js
@@ -76,7 +76,7 @@
        var attribs = token.attribs;
        var dataAttribs = token.dataAttribs;
        var tsr = dataAttribs.tsr;
-       var flags = dataAttribs.flags;
+       var flags = dataAttribs.flags || [];
        var isMeta = false;
 
        cb({async: true});
@@ -143,35 +143,38 @@
                        }
                });
 
-               if (flags.length === 0 && dataAttribs.variants.length > 0) {
+               var dataMW;
+               if (flags.length==0 && dataAttribs.variants.length > 0) {
                        // "Restrict possible variants to a limited set"
-                       return { filter: dataAttribs.variants, text: 
texts[0].text };
-               }
-               var dataMW = flags.reduce(function(dmw, f) {
-                       if (LC_FLAG_MAP.has(f)) {
-                               if (LC_FLAG_MAP.get(f)) {
-                                       dmw[LC_FLAG_MAP.get(f)] = true;
-                               }
-                       } else {
-                               dmw.error = true;
-                       }
-                       return dmw;
-               }, {});
-               // (this test is done at the top of 
ConverterRule::getRuleConvertedStr)
-               if (texts.length === 1 && !texts[0].lang) {
-                       dataMW.disabled = true;
-               }
-               isMeta = !dataMW.show;
-               if (dataMW.disabled) {
-                       dataMW.text = texts[0].text;
-                       dataMW.show = (dataMW.title || dataMW.add) ? undefined 
: true;
-                       isMeta = !dataMW.show;
-               } else if (sawBidir) {
-                       dataMW.bidir = bidir;
-                       if (sawUnidir) { dataMW.error = true; }
+                       // XXX can't return directly.
+                       dataMW = { filter: dataAttribs.variants, text: 
texts[0].text };
                } else {
-                       dataMW.unidir = unidir;
-                       if (!sawUnidir) { dataMW.error = true; }
+                       dataMW = flags.reduce(function(dmw, f) {
+                               if (LC_FLAG_MAP.has(f)) {
+                                       if (LC_FLAG_MAP.get(f)) {
+                                               dmw[LC_FLAG_MAP.get(f)] = true;
+                                       }
+                               } else {
+                                       dmw.error = true;
+                               }
+                               return dmw;
+                       }, {});
+                       // (this test is done at the top of 
ConverterRule::getRuleConvertedStr)
+                       if (texts.length === 1 && !texts[0].lang) {
+                               dataMW.disabled = true;
+                       }
+                       isMeta = !dataMW.show;
+                       if (dataMW.disabled) {
+                               dataMW.text = texts[0].text;
+                               dataMW.show = (dataMW.title || dataMW.add) ? 
undefined : true;
+                               isMeta = !dataMW.show;
+                       } else if (sawBidir) {
+                               dataMW.bidir = bidir;
+                               if (sawUnidir) { dataMW.error = true; }
+                       } else {
+                               dataMW.unidir = unidir;
+                               if (!sawUnidir) { dataMW.error = true; }
+                       }
                }
 
                // Our markup is always the same, except for the contents of the
diff --git a/tests/parserTests.txt b/tests/parserTests.txt
index f34517f..f966cbd 100644
--- a/tests/parserTests.txt
+++ b/tests/parserTests.txt
@@ -17055,7 +17055,7 @@
 </p>
 !! html/parsoid
 <p>this bit is safe: }-</p>
-<p>but if we add a conversion instance: <span typeof="mw:LanguageVariant" 
data-mw='{"bidir":[{"l":"zh-cn","t":"xxx"},{"l":"zh-tw","t":"yyy"}],"show":true}'></span></p>
+<p>but if we add a conversion instance: <span typeof="mw:LanguageVariant" 
data-parsoid='{"fl":[]}' 
data-mw='{"bidir":[{"l":"zh-cn","t":"xxx"},{"l":"zh-tw","t":"yyy"}],"show":true}'></span></p>
 <p>then we get cut off here: }-</p>
 <p>all additional text is vanished</p>
 !! end
@@ -19908,6 +19908,18 @@
 <p><span typeof="mw:LanguageVariant" 
data-mw='{"bidir":[{"l":"zh","t":"China"},{"l":"zh-tw","t":"Taiwan"}],"show":true}'></span>,
 not China</p>
 !! end
 
+!! test
+Filter syntax for language variants
+!! options
+language=zh variant=zh-tw
+!! wikitext
+foo-{zh;zh-hans;zh-hant|blog, WEBJOURNAL, WEBLOG}-quux
+!! html/php
+<p>fooblog, WEBJOURNAL, WEBLOGquux
+</p>
+!! html/parsoid
+<p>foo<span typeof="mw:LanguageVariant" 
data-mw='{"filter":["zh","zh-hans","zh-hant"],"text":"blog, WEBJOURNAL, 
WEBLOG"}'></span>quux</p>
+!! end
 
 # Note that Parsoid post-processing for language variants needs to
 # update the `title` attribute here, based on the mw:ExpandedAttrs property
@@ -20116,49 +20128,43 @@
 <p><span typeof="mw:LanguageVariant" 
data-mw='{"bidir":[{"l":"zh-cn","t":"0"},{"l":"zh-sg","t":"1"},{"l":"zh-tw","t":"2"},{"l":"zh-hk","t":"3"}],"show":true}'></span></p>
 !! end
 
-# Parsoid html2wt mode disabled due to use of trailing semicolon in rule.
 !! test
 Conversion rules from [numeric-only string] to [something else] (T48634)
 !! options
 language=zh variant=zh-cn
-parsoid=wt2html,wt2wt,html2html
 !! input
 
-{H|0=>zh-cn:B}--{H|0=>zh-cn:C;0=>zh-cn:D}--{H|0=>zh-hans:A}-012345-{A|zh-tw:0;zh-cn:E;}-012345
 !! html/php
 <p>D12345EE12345
 </p>
 !! html/parsoid
-<p><meta typeof="mw:LanguageVariant" 
data-mw='{"add":true,"unidir":[{"f":"0","l":"zh-cn","t":"B"}]}'/><meta 
typeof="mw:LanguageVariant" 
data-mw='{"add":true,"unidir":[{"f":"0","l":"zh-cn","t":"C"},{"f":"0","l":"zh-cn","t":"D"}]}'/><meta
 typeof="mw:LanguageVariant" 
data-mw='{"add":true,"unidir":[{"f":"0","l":"zh-hans","t":"A"}]}'/>012345<span 
typeof="mw:LanguageVariant" 
data-mw='{"add":true,"bidir":[{"l":"zh-tw","t":"0"},{"l":"zh-cn","t":"E"}],"show":true}'></span>012345</p>
+<p><meta typeof="mw:LanguageVariant" 
data-mw='{"add":true,"unidir":[{"f":"0","l":"zh-cn","t":"B"}]}'/><meta 
typeof="mw:LanguageVariant" 
data-mw='{"add":true,"unidir":[{"f":"0","l":"zh-cn","t":"C"},{"f":"0","l":"zh-cn","t":"D"}]}'/><meta
 typeof="mw:LanguageVariant" 
data-mw='{"add":true,"unidir":[{"f":"0","l":"zh-hans","t":"A"}]}'/>012345<span 
typeof="mw:LanguageVariant" data-parsoid='{"ts":true,"fl":["A"]}' 
data-mw='{"add":true,"bidir":[{"l":"zh-tw","t":"0"},{"l":"zh-cn","t":"E"}],"show":true}'></span>012345</p>
 !! end
 
-# Parsoid html2wt mode disabled due to use of trailing semicolon in rule.
 !! test
 Bidirectional converter rule entries with an empty value should be ignored 
(T53551)
 !! options
 language=zh variant=zh-cn
-parsoid=wt2html,wt2wt,html2html
 !! input
 -{H|zh-cn:foo;zh-tw:;}-foobar
 !! html/php
 <p>foobar
 </p>
 !! html/parsoid
-<p><meta typeof="mw:LanguageVariant" 
data-mw='{"add":true,"bidir":[{"l":"zh-cn","t":"foo"},{"l":"zh-tw","t":""}]}'/>foobar</p>
+<p><meta typeof="mw:LanguageVariant" data-parsoid='{"ts":true}' 
data-mw='{"add":true,"bidir":[{"l":"zh-cn","t":"foo"},{"l":"zh-tw","t":""}]}'/>foobar</p>
 !! end
 
-# Parsoid html2wt mode disabled due to use of trailing semicolon in rule.
 !! test
 Unidirectional converter rule entries with an empty "from" string should be 
ignored (T53551)
 !! options
 language=zh variant=zh-cn
-parsoid=wt2html,wt2wt,html2html
 !! input
 -{H|=>zh-cn:foo;}-foobar
 !! html/php
 <p>foobar
 </p>
 !! html/parsoid
-<p><meta typeof="mw:LanguageVariant" 
data-mw='{"add":true,"unidir":[{"f":"","l":"zh-cn","t":"foo"}]}'/>foobar</p>
+<p><meta typeof="mw:LanguageVariant" data-parsoid='{"ts":true}' 
data-mw='{"add":true,"unidir":[{"f":"","l":"zh-cn","t":"foo"}]}'/>foobar</p>
 !! end
 
 !! test
@@ -20261,12 +20267,10 @@
 <p><span typeof="mw:LanguageVariant" 
data-mw='{"disabled":true,"show":true,"text":"=&lt;span typeof=\"mw:Entity\" 
data-parsoid=&#39;{\"src\":\"&amp;amp;gt;\",\"srcContent\":\">\",\"dsr\":[5,9,null,null]}&#39;>>&lt;/span>"}'></span></p>
 !!end
 
-# Parsoid html2wt mode disabled due to R flag (which is implicit in html2wt)
 !! test
 Don't break link parsing if language converter markup is in the caption.
 !! options
 language=sr variant=sr-ec
-parsoid=wt2html,wt2wt,html2html
 !! wikitext
 [[Main Page|-{R|main page}-]]
 !! html/php
@@ -20332,13 +20336,14 @@
 !! test
 Language converter tricky html2wt cases (1)
 !! options
-parsoid=html2wt,html2html
+language=sr
+parsoid=html2wt,wt2wt
 !! html/parsoid
 <p><span typeof="mw:LanguageVariant" 
data-mw='{"disabled":true,"show":true,"text":"}-"}'></span></p>
 !! wikitext
--{}&#45;}-
+-{R|&#125;-}-
 !! html/php
-<p>-{}&#45;}-
+<p>&#125;-
 </p>
 !! end
 
@@ -20347,13 +20352,87 @@
 !! test
 Language converter tricky html2wt cases (2)
 !! options
-parsoid=html2wt,html2html
+language=sr
+parsoid=html2wt,wt2wt
 !! html/parsoid
 <p>-{foo}-</p>
 !! wikitext
-&#45;{}-
+<nowiki>-{foo}-</nowiki>
 !! html/php
-<p>foo
+<p>-&#123;foo&#125;-
+</p>
+!! end
+
+!! test
+Language converter tricky html2wt cases (3)
+!! options
+language=sr
+parsoid=html2wt,wt2wt
+!! html/parsoid
+<p><span typeof="mw:LanguageVariant" 
data-mw='{"disabled":true,"show":true,"text":"|"}'></span></p>
+<p><span typeof="mw:LanguageVariant" 
data-mw='{"disabled":true,"show":true,"text":"R|raw"}'></span></p>
+<p><span typeof="mw:LanguageVariant" 
data-mw='{"disabled":true,"show":true,"text":"-{foo}-"}'></span></p>
+!! wikitext
+-{R||}-
+
+-{R|R|raw}-
+
+-{<nowiki>-{foo}-</nowiki>}-
+!! html/php
+<p>|
+</p><p>R|raw
+</p><p>-&#123;foo&#125;-
+</p>
+!! end
+
+!! test
+Language converter tricky html2wt cases (4)
+!! options
+language=sr
+parsoid=html2wt,wt2wt
+!! html/parsoid
+<p><span typeof="mw:LanguageVariant" 
data-mw='{"disabled":true,"show":true,"text":"&lt;span about=\"#mwt1\" 
typeof=\"mw:Transclusion\" 
data-parsoid=&#39;{\"pi\":[[{\"k\":\"1\"}]],\"dsr\":[2,14,null,null]}&#39; 
data-mw=&#39;{\"parts\":[{\"template\":{\"target\":{\"wt\":\"echo\",\"href\":\"./Template:Echo\"},\"params\":{\"1\":{\"wt\":\"hey\"}},\"i\":0}}]}&#39;>hey&lt;/span>"}'></span></p>
+!! wikitext
+-{R|{{echo|hey}}}-
+!! html/php
+<p>hey
+</p>
+!! end
+
+# Note that the <nowiki> escaping added by parsoid for source text,
+# destination text, and language names only works on the PHP side
+# for *destination text*.  (HTML entity escaping wouldn't work
+# any better.)  This is probably a bug, at least for source texts.
+# (For language names PHP uses a precise regexp based on the languages
+# it currently knows have variants, which is fragile since this set
+# can grow/shrink over time.)
+!! test
+Language converter tricky html2wt cases (5)
+!! options
+language=zh variant=zh-cn
+parsoid=html2wt,wt2wt
+!! html/parsoid
+<p><meta typeof="mw:LanguageVariant" 
data-mw='{"add":true,"unidir":[{"f":"a:b=>c","l":"zh-cn","t":"x;foo=>zh-cn:boo"},{"f":"bar","l":"zh-cn","t":"bat;xyz=>zh-cn:abc"}]}'/>foobar</p>
+<p><meta typeof="mw:LanguageVariant" 
data-mw='{"add":true,"unidir":[{"f":"A","l":"bo:g;us","t":"B"}]}'/></p>
+<p><span typeof="mw:LanguageVariant" 
data-mw='{"bidir":[{"l":"zh-tw","t":"xyz"},{"l":"zh-cn","t":"0;zh-tw:bar"}],"show":true}'></span></p>
+<p><span typeof="mw:LanguageVariant" 
data-mw='{"bidir":[{"l":"bo:g;us","t":"xyz"},{"l":"zh-cn","t":"abc"}],"show":true}'></span></p>
+<p>a:b=>c xyz</p>
+!! wikitext
+-{H|<nowiki>a:b=>c</nowiki>=>zh-cn:<nowiki>x;foo=>zh-cn:boo</nowiki>;bar=>zh-cn:<nowiki>bat;xyz=>zh-cn:abc</nowiki>}-foobar
+
+-{H|A=><nowiki>bo:g;us</nowiki>:B}-A
+
+-{zh-tw:xyz;zh-cn:<nowiki>0;zh-tw:bar</nowiki>}-
+
+-{<nowiki>bo:g;us</nowiki>:xyz;zh-cn:abc}-
+
+a:b=>c xyz
+!! html/php
+<p>foobat;xyz=&gt;zh-cn:abc
+</p><p>A
+</p><p>0;zh-tw:bar
+</p><p>abc
+</p><p>a:b=&gt;c 0;zh-tw:bar
 </p>
 !! end
 

-- 
To view, visit https://gerrit.wikimedia.org/r/316236
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id6688d72175470148fb3295771f10cf2e802caa5
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: C. Scott Ananian <canan...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to