C. Scott Ananian has uploaded a new change for review. https://gerrit.wikimedia.org/r/316236
Change subject: WIP: fix more language converter bugs. ...................................................................... WIP: fix more language converter bugs. Change-Id: Id6688d72175470148fb3295771f10cf2e802caa5 --- M lib/config/WikiConfig.js M lib/html2wt/LanguageVariantHandler.js M lib/mw/ApiRequest.js M lib/wt2html/tt/LanguageVariantHandler.js M tests/parserTests.txt 5 files changed, 150 insertions(+), 51 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/36/316236/1 diff --git a/lib/config/WikiConfig.js b/lib/config/WikiConfig.js index 37a4f2a..30cac7e 100644 --- a/lib/config/WikiConfig.js +++ b/lib/config/WikiConfig.js @@ -514,6 +514,17 @@ // Function hooks on this wiki, indexed by their normalized form this.functionHooks = new Set(resultConf.functionhooks || []); + // Somewhat annoyingly, LanguageConversion is turned on by default + // for all wikis (ie, $wgDisableLangConversion = false, as reported + // by siteinfo general.langconversion), but the + // -{ }- syntax is only parsed when the currently selected language + // has variants. For the most part, the "language" is + // siteinfo general.lang and the variants are in siteinfo as lists in + // general.fallback and general.variants. *However* various mechanisms + // (like the Translate extension) can change the default language + // for a given page, and then we need to use other mechanisms to see + // if variants are active. + // Match a wikitext line containing just whitespace, comments, and // sol transparent links and behavior switches. // Redirects should not contain any preceding non-whitespace chars. diff --git a/lib/html2wt/LanguageVariantHandler.js b/lib/html2wt/LanguageVariantHandler.js index cedbb3d..1f9cc0a 100644 --- a/lib/html2wt/LanguageVariantHandler.js +++ b/lib/html2wt/LanguageVariantHandler.js @@ -61,15 +61,19 @@ }; // Helper function: protect characters not allowed in -{ }- strings. + // XXX escape := in left side of unidir rules, and in language names (!) var protect = function(t) { - // XXX escape - characters that would form -{ }- markup. - // XXX escape confusing | : ; characters? - // XXX ensure that this wikitext won't break the -{}- context - return t; + return true ? t : t.replace(/[\{\}|:]/g, function(c) { + // Escape {} characters that would form -{ }- markup + // Escape | that would form a lang_variant_flag + // Escape : that would form a lang_variant_option + return '&#' + c.charCodeAt(0) + ';'; + }); }; // Helper function: combine the three parts of the -{ }- string var combine = function(flagStr, bodyStr, useTrailingSemi) { - if (flagStr || /|/.test(bodyStr)) { flagStr = flagStr + '|'; } + console.log('combine', JSON.stringify({flagStr:flagStr,bodyStr:bodyStr,trailing:useTrailingSemi})); + if (flagStr || /\|/.test(bodyStr)) { flagStr = flagStr + '|'; } if (useTrailingSemi) { bodyStr = bodyStr + ';'; } return flagStr + bodyStr; }; diff --git a/lib/mw/ApiRequest.js b/lib/mw/ApiRequest.js index 886f856..6edbf7b 100644 --- a/lib/mw/ApiRequest.js +++ b/lib/mw/ApiRequest.js @@ -76,6 +76,8 @@ var latestSerial = 0; // all revision properties which parsoid is interested in. +// XXX in theory I should ask for a pagelanguage here? +// XXX need to add prop=info in order to get that! see ApiQueryInfo var PARSOID_RVPROP = ('content|ids|timestamp|user|userid|size|sha1|contentmodel|comment'); var logAPIWarnings = function(req, data) { diff --git a/lib/wt2html/tt/LanguageVariantHandler.js b/lib/wt2html/tt/LanguageVariantHandler.js index c6c20ba..ca1b4df 100644 --- a/lib/wt2html/tt/LanguageVariantHandler.js +++ b/lib/wt2html/tt/LanguageVariantHandler.js @@ -76,7 +76,7 @@ var attribs = token.attribs; var dataAttribs = token.dataAttribs; var tsr = dataAttribs.tsr; - var flags = dataAttribs.flags; + var flags = dataAttribs.flags || []; var isMeta = false; cb({async: true}); @@ -143,35 +143,38 @@ } }); - if (flags.length === 0 && dataAttribs.variants.length > 0) { + var dataMW; + if (flags.length==0 && dataAttribs.variants.length > 0) { // "Restrict possible variants to a limited set" - return { filter: dataAttribs.variants, text: texts[0].text }; - } - var dataMW = flags.reduce(function(dmw, f) { - if (LC_FLAG_MAP.has(f)) { - if (LC_FLAG_MAP.get(f)) { - dmw[LC_FLAG_MAP.get(f)] = true; - } - } else { - dmw.error = true; - } - return dmw; - }, {}); - // (this test is done at the top of ConverterRule::getRuleConvertedStr) - if (texts.length === 1 && !texts[0].lang) { - dataMW.disabled = true; - } - isMeta = !dataMW.show; - if (dataMW.disabled) { - dataMW.text = texts[0].text; - dataMW.show = (dataMW.title || dataMW.add) ? undefined : true; - isMeta = !dataMW.show; - } else if (sawBidir) { - dataMW.bidir = bidir; - if (sawUnidir) { dataMW.error = true; } + // XXX can't return directly. + dataMW = { filter: dataAttribs.variants, text: texts[0].text }; } else { - dataMW.unidir = unidir; - if (!sawUnidir) { dataMW.error = true; } + dataMW = flags.reduce(function(dmw, f) { + if (LC_FLAG_MAP.has(f)) { + if (LC_FLAG_MAP.get(f)) { + dmw[LC_FLAG_MAP.get(f)] = true; + } + } else { + dmw.error = true; + } + return dmw; + }, {}); + // (this test is done at the top of ConverterRule::getRuleConvertedStr) + if (texts.length === 1 && !texts[0].lang) { + dataMW.disabled = true; + } + isMeta = !dataMW.show; + if (dataMW.disabled) { + dataMW.text = texts[0].text; + dataMW.show = (dataMW.title || dataMW.add) ? undefined : true; + isMeta = !dataMW.show; + } else if (sawBidir) { + dataMW.bidir = bidir; + if (sawUnidir) { dataMW.error = true; } + } else { + dataMW.unidir = unidir; + if (!sawUnidir) { dataMW.error = true; } + } } // Our markup is always the same, except for the contents of the diff --git a/tests/parserTests.txt b/tests/parserTests.txt index f34517f..f966cbd 100644 --- a/tests/parserTests.txt +++ b/tests/parserTests.txt @@ -17055,7 +17055,7 @@ </p> !! html/parsoid <p>this bit is safe: }-</p> -<p>but if we add a conversion instance: <span typeof="mw:LanguageVariant" data-mw='{"bidir":[{"l":"zh-cn","t":"xxx"},{"l":"zh-tw","t":"yyy"}],"show":true}'></span></p> +<p>but if we add a conversion instance: <span typeof="mw:LanguageVariant" data-parsoid='{"fl":[]}' data-mw='{"bidir":[{"l":"zh-cn","t":"xxx"},{"l":"zh-tw","t":"yyy"}],"show":true}'></span></p> <p>then we get cut off here: }-</p> <p>all additional text is vanished</p> !! end @@ -19908,6 +19908,18 @@ <p><span typeof="mw:LanguageVariant" data-mw='{"bidir":[{"l":"zh","t":"China"},{"l":"zh-tw","t":"Taiwan"}],"show":true}'></span>, not China</p> !! end +!! test +Filter syntax for language variants +!! options +language=zh variant=zh-tw +!! wikitext +foo-{zh;zh-hans;zh-hant|blog, WEBJOURNAL, WEBLOG}-quux +!! html/php +<p>fooblog, WEBJOURNAL, WEBLOGquux +</p> +!! html/parsoid +<p>foo<span typeof="mw:LanguageVariant" data-mw='{"filter":["zh","zh-hans","zh-hant"],"text":"blog, WEBJOURNAL, WEBLOG"}'></span>quux</p> +!! end # Note that Parsoid post-processing for language variants needs to # update the `title` attribute here, based on the mw:ExpandedAttrs property @@ -20116,49 +20128,43 @@ <p><span typeof="mw:LanguageVariant" data-mw='{"bidir":[{"l":"zh-cn","t":"0"},{"l":"zh-sg","t":"1"},{"l":"zh-tw","t":"2"},{"l":"zh-hk","t":"3"}],"show":true}'></span></p> !! end -# Parsoid html2wt mode disabled due to use of trailing semicolon in rule. !! test Conversion rules from [numeric-only string] to [something else] (T48634) !! options language=zh variant=zh-cn -parsoid=wt2html,wt2wt,html2html !! input -{H|0=>zh-cn:B}--{H|0=>zh-cn:C;0=>zh-cn:D}--{H|0=>zh-hans:A}-012345-{A|zh-tw:0;zh-cn:E;}-012345 !! html/php <p>D12345EE12345 </p> !! html/parsoid -<p><meta typeof="mw:LanguageVariant" data-mw='{"add":true,"unidir":[{"f":"0","l":"zh-cn","t":"B"}]}'/><meta typeof="mw:LanguageVariant" data-mw='{"add":true,"unidir":[{"f":"0","l":"zh-cn","t":"C"},{"f":"0","l":"zh-cn","t":"D"}]}'/><meta typeof="mw:LanguageVariant" data-mw='{"add":true,"unidir":[{"f":"0","l":"zh-hans","t":"A"}]}'/>012345<span typeof="mw:LanguageVariant" data-mw='{"add":true,"bidir":[{"l":"zh-tw","t":"0"},{"l":"zh-cn","t":"E"}],"show":true}'></span>012345</p> +<p><meta typeof="mw:LanguageVariant" data-mw='{"add":true,"unidir":[{"f":"0","l":"zh-cn","t":"B"}]}'/><meta typeof="mw:LanguageVariant" data-mw='{"add":true,"unidir":[{"f":"0","l":"zh-cn","t":"C"},{"f":"0","l":"zh-cn","t":"D"}]}'/><meta typeof="mw:LanguageVariant" data-mw='{"add":true,"unidir":[{"f":"0","l":"zh-hans","t":"A"}]}'/>012345<span typeof="mw:LanguageVariant" data-parsoid='{"ts":true,"fl":["A"]}' data-mw='{"add":true,"bidir":[{"l":"zh-tw","t":"0"},{"l":"zh-cn","t":"E"}],"show":true}'></span>012345</p> !! end -# Parsoid html2wt mode disabled due to use of trailing semicolon in rule. !! test Bidirectional converter rule entries with an empty value should be ignored (T53551) !! options language=zh variant=zh-cn -parsoid=wt2html,wt2wt,html2html !! input -{H|zh-cn:foo;zh-tw:;}-foobar !! html/php <p>foobar </p> !! html/parsoid -<p><meta typeof="mw:LanguageVariant" data-mw='{"add":true,"bidir":[{"l":"zh-cn","t":"foo"},{"l":"zh-tw","t":""}]}'/>foobar</p> +<p><meta typeof="mw:LanguageVariant" data-parsoid='{"ts":true}' data-mw='{"add":true,"bidir":[{"l":"zh-cn","t":"foo"},{"l":"zh-tw","t":""}]}'/>foobar</p> !! end -# Parsoid html2wt mode disabled due to use of trailing semicolon in rule. !! test Unidirectional converter rule entries with an empty "from" string should be ignored (T53551) !! options language=zh variant=zh-cn -parsoid=wt2html,wt2wt,html2html !! input -{H|=>zh-cn:foo;}-foobar !! html/php <p>foobar </p> !! html/parsoid -<p><meta typeof="mw:LanguageVariant" data-mw='{"add":true,"unidir":[{"f":"","l":"zh-cn","t":"foo"}]}'/>foobar</p> +<p><meta typeof="mw:LanguageVariant" data-parsoid='{"ts":true}' data-mw='{"add":true,"unidir":[{"f":"","l":"zh-cn","t":"foo"}]}'/>foobar</p> !! end !! test @@ -20261,12 +20267,10 @@ <p><span typeof="mw:LanguageVariant" data-mw='{"disabled":true,"show":true,"text":"=<span typeof=\"mw:Entity\" data-parsoid='{\"src\":\"&amp;gt;\",\"srcContent\":\">\",\"dsr\":[5,9,null,null]}'>></span>"}'></span></p> !!end -# Parsoid html2wt mode disabled due to R flag (which is implicit in html2wt) !! test Don't break link parsing if language converter markup is in the caption. !! options language=sr variant=sr-ec -parsoid=wt2html,wt2wt,html2html !! wikitext [[Main Page|-{R|main page}-]] !! html/php @@ -20332,13 +20336,14 @@ !! test Language converter tricky html2wt cases (1) !! options -parsoid=html2wt,html2html +language=sr +parsoid=html2wt,wt2wt !! html/parsoid <p><span typeof="mw:LanguageVariant" data-mw='{"disabled":true,"show":true,"text":"}-"}'></span></p> !! wikitext --{}-}- +-{R|}-}- !! html/php -<p>-{}-}- +<p>}- </p> !! end @@ -20347,13 +20352,87 @@ !! test Language converter tricky html2wt cases (2) !! options -parsoid=html2wt,html2html +language=sr +parsoid=html2wt,wt2wt !! html/parsoid <p>-{foo}-</p> !! wikitext --{}- +<nowiki>-{foo}-</nowiki> !! html/php -<p>foo +<p>-{foo}- +</p> +!! end + +!! test +Language converter tricky html2wt cases (3) +!! options +language=sr +parsoid=html2wt,wt2wt +!! html/parsoid +<p><span typeof="mw:LanguageVariant" data-mw='{"disabled":true,"show":true,"text":"|"}'></span></p> +<p><span typeof="mw:LanguageVariant" data-mw='{"disabled":true,"show":true,"text":"R|raw"}'></span></p> +<p><span typeof="mw:LanguageVariant" data-mw='{"disabled":true,"show":true,"text":"-{foo}-"}'></span></p> +!! wikitext +-{R||}- + +-{R|R|raw}- + +-{<nowiki>-{foo}-</nowiki>}- +!! html/php +<p>| +</p><p>R|raw +</p><p>-{foo}- +</p> +!! end + +!! test +Language converter tricky html2wt cases (4) +!! options +language=sr +parsoid=html2wt,wt2wt +!! html/parsoid +<p><span typeof="mw:LanguageVariant" data-mw='{"disabled":true,"show":true,"text":"<span about=\"#mwt1\" typeof=\"mw:Transclusion\" data-parsoid='{\"pi\":[[{\"k\":\"1\"}]],\"dsr\":[2,14,null,null]}' data-mw='{\"parts\":[{\"template\":{\"target\":{\"wt\":\"echo\",\"href\":\"./Template:Echo\"},\"params\":{\"1\":{\"wt\":\"hey\"}},\"i\":0}}]}'>hey</span>"}'></span></p> +!! wikitext +-{R|{{echo|hey}}}- +!! html/php +<p>hey +</p> +!! end + +# Note that the <nowiki> escaping added by parsoid for source text, +# destination text, and language names only works on the PHP side +# for *destination text*. (HTML entity escaping wouldn't work +# any better.) This is probably a bug, at least for source texts. +# (For language names PHP uses a precise regexp based on the languages +# it currently knows have variants, which is fragile since this set +# can grow/shrink over time.) +!! test +Language converter tricky html2wt cases (5) +!! options +language=zh variant=zh-cn +parsoid=html2wt,wt2wt +!! html/parsoid +<p><meta typeof="mw:LanguageVariant" data-mw='{"add":true,"unidir":[{"f":"a:b=>c","l":"zh-cn","t":"x;foo=>zh-cn:boo"},{"f":"bar","l":"zh-cn","t":"bat;xyz=>zh-cn:abc"}]}'/>foobar</p> +<p><meta typeof="mw:LanguageVariant" data-mw='{"add":true,"unidir":[{"f":"A","l":"bo:g;us","t":"B"}]}'/></p> +<p><span typeof="mw:LanguageVariant" data-mw='{"bidir":[{"l":"zh-tw","t":"xyz"},{"l":"zh-cn","t":"0;zh-tw:bar"}],"show":true}'></span></p> +<p><span typeof="mw:LanguageVariant" data-mw='{"bidir":[{"l":"bo:g;us","t":"xyz"},{"l":"zh-cn","t":"abc"}],"show":true}'></span></p> +<p>a:b=>c xyz</p> +!! wikitext +-{H|<nowiki>a:b=>c</nowiki>=>zh-cn:<nowiki>x;foo=>zh-cn:boo</nowiki>;bar=>zh-cn:<nowiki>bat;xyz=>zh-cn:abc</nowiki>}-foobar + +-{H|A=><nowiki>bo:g;us</nowiki>:B}-A + +-{zh-tw:xyz;zh-cn:<nowiki>0;zh-tw:bar</nowiki>}- + +-{<nowiki>bo:g;us</nowiki>:xyz;zh-cn:abc}- + +a:b=>c xyz +!! html/php +<p>foobat;xyz=>zh-cn:abc +</p><p>A +</p><p>0;zh-tw:bar +</p><p>abc +</p><p>a:b=>c 0;zh-tw:bar </p> !! end -- To view, visit https://gerrit.wikimedia.org/r/316236 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Id6688d72175470148fb3295771f10cf2e802caa5 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: C. Scott Ananian <canan...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits