C. Scott Ananian has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/393908 )
Change subject: Properly handle short headings
......................................................................
Properly handle short headings
This is a minor issue, but it causes some test case failures that would
otherwise be blamed on change I12b2a148f7170d20bd9aacd3b5b8ee1965859592.
Bug: T21910
Change-Id: I11926f2d2365755794d8f8f6647b1f0b02b827ab
---
M lib/wt2html/pegTokenizer.pegjs
M tests/parserTests-blacklist.js
M tests/parserTests.txt
3 files changed, 64 insertions(+), 10 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid
refs/changes/08/393908/1
diff --git a/lib/wt2html/pegTokenizer.pegjs b/lib/wt2html/pegTokenizer.pegjs
index 3dfa339..3f7dfcd 100644
--- a/lib/wt2html/pegTokenizer.pegjs
+++ b/lib/wt2html/pegTokenizer.pegjs
@@ -489,14 +489,25 @@
r:(
s:$'='+ // moved in here to make s accessible to inner action
& { return stops.inc('h'); }
- c:nested_block_line
- e:$'='+
+ ce:(
+ nested_block_line
+ $'='+
+ )?
endTPos:("" { return endOffset(); })
spc:(spaces / comment)*
+ & { stops.dec('h'); return ce || s.length > 2; }
&eolf
{
- stops.dec('h');
- var level = Math.min(s.length, e.length);
+ var c = ce ? ce[0] : '';
+ var e = ce ? ce[1] : '';
+ var level;
+ if (!ce) {
+ // split up heading
+ level = (s.length - 1) >>> 1;
+ c = '='.repeat(s.length - 2*level);
+ s = e = '='.repeat(level);
+ }
+ level = Math.min(s.length, e.length);
level = Math.min(6, level);
// convert surplus equals into text
if (s.length > level) {
diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js
index 9b7e84c..e01d540 100644
--- a/tests/parserTests-blacklist.js
+++ b/tests/parserTests-blacklist.js
@@ -146,7 +146,6 @@
add("wt2html", "TOC with wgMaxTocLevel=3 (T8204)", "<h2 id=\"title_1\"
data-parsoid='{\"dsr\":[0,13,2,2]}'> title 1 </h2>\n<h3 id=\"title_1.1\"
data-parsoid='{\"dsr\":[14,31,3,3]}'> title 1.1 </h3>\n<h4 id=\"title_1.1.1\"
data-parsoid='{\"dsr\":[32,53,4,4]}'> title 1.1.1 </h4>\n<h3 id=\"title_1.2\"
data-parsoid='{\"dsr\":[54,71,3,3]}'> title 1.2 </h3>\n<h2 id=\"title_2\"
data-parsoid='{\"dsr\":[72,85,2,2]}'> title 2 </h2>\n<h3 id=\"title_2.1\"
data-parsoid='{\"dsr\":[86,103,3,3]}'> title 2.1 </h3>");
add("wt2html", "TOC with wgMaxTocLevel=3 and two level four headings (T8204)",
"<h2 id=\"Section_1\" data-parsoid='{\"dsr\":[0,13,2,2]}'>Section 1</h2>\n<h3
id=\"Section_1.1\" data-parsoid='{\"dsr\":[14,31,3,3]}'>Section 1.1</h3>\n<h4
id=\"Section_1.1.1\" data-parsoid='{\"dsr\":[32,53,4,4]}'>Section
1.1.1</h4>\n<h4 id=\"Section_1.1.1.1\"
data-parsoid='{\"dsr\":[54,77,4,4]}'>Section 1.1.1.1</h4>\n<h2 id=\"Section_2\"
data-parsoid='{\"dsr\":[78,91,2,2]}'>Section 2</h2>");
add("wt2html", "TOC regression (T14077)", "<meta property=\"mw:PageProp/toc\"
data-parsoid='{\"src\":\"__TOC__\",\"magicSrc\":\"__TOC__\",\"dsr\":[0,7,null,null]}'/>\n<h2
id=\"title_1\" data-parsoid='{\"dsr\":[8,21,2,2]}'> title 1 </h2>\n<h3
id=\"title_1.1\" data-parsoid='{\"dsr\":[22,39,3,3]}'> title 1.1 </h3>\n<h2
id=\"title_2\" data-parsoid='{\"dsr\":[40,53,2,2]}'> title 2 </h2>");
-add("wt2html", "Short headings with trailing space should match behavior of
Parser::doHeadings (T21910)", "<p data-parsoid='{\"dsr\":[0,100,0,0]}'>===
\nThe line above must have a trailing space!\n=== <!--\n--> <!-- -->\nBut just
in case it doesn't...</p>");
add("wt2html", "Header with special characters (T27462)", "<p
data-parsoid='{\"dsr\":[0,72,0,0]}'>The tooltips shall not show entities to the
user (ie. be double escaped)</p>\n\n<h2 id=\"text_.3E_text\"
data-parsoid='{\"dsr\":[74,91,2,2]}'> text > text </h2>\n<p
data-parsoid='{\"dsr\":[92,101,0,0]}'>section 1</p>\n\n<h2 id=\"text_.3C_text\"
data-parsoid='{\"dsr\":[103,120,2,2]}'> text < text </h2>\n<p
data-parsoid='{\"dsr\":[121,130,0,0]}'>section 2</p>\n\n<h2
id=\"text_.26_text\" data-parsoid='{\"dsr\":[132,149,2,2]}'> text & text
</h2>\n<p data-parsoid='{\"dsr\":[150,159,0,0]}'>section 3</p>\n\n<h2
id=\"text_.27_text\" data-parsoid='{\"dsr\":[161,178,2,2]}'> text ' text
</h2>\n<p data-parsoid='{\"dsr\":[179,188,0,0]}'>section 4</p>\n\n<h2
id=\"text_.22_text\" data-parsoid='{\"dsr\":[190,207,2,2]}'> text \" text
</h2>\n<p data-parsoid='{\"dsr\":[208,217,0,0]}'>section 5</p>");
add("wt2html", "Header with space, plus and underscore as entity", "<p
data-parsoid='{\"dsr\":[0,34,0,0]}'>Id should not contain + for
spaces</p>\n\n<h2 id=\"Space_between_Text\"
data-parsoid='{\"dsr\":[36,60,2,2]}'> Space between Text </h2>\n<p
data-parsoid='{\"dsr\":[61,70,0,0]}'>section 1</p>\n\n<h2
id=\"Space-Entity_between_Text\" data-parsoid='{\"dsr\":[72,111,2,2]}'>
Space-Entity<span typeof=\"mw:Entity\"
data-parsoid='{\"src\":\"&#32;\",\"srcContent\":\"
\",\"dsr\":[87,92,null,null]}'> </span>between<span typeof=\"mw:Entity\"
data-parsoid='{\"src\":\"&#32;\",\"srcContent\":\"
\",\"dsr\":[99,104,null,null]}'> </span>Text </h2>\n<p
data-parsoid='{\"dsr\":[112,121,0,0]}'>section 2</p>\n\n<h2
id=\"Plus.2Bbetween.2BText\" data-parsoid='{\"dsr\":[123,146,2,2]}'>
Plus+between+Text </h2>\n<p data-parsoid='{\"dsr\":[147,156,0,0]}'>section
3</p>\n\n<h2 id=\"Plus-Entity.2Bbetween.2BText\"
data-parsoid='{\"dsr\":[158,196,2,2]}'> Plus-Entity<span typeof=\"mw:Entity\"
data-parsoid='{\"src\":\"&#43;\",\"srcContent\":\"+\",\"dsr\":[172,177,null,null]}'>+</span>between<span
typeof=\"mw:Entity\"
data-parsoid='{\"src\":\"&#43;\",\"srcContent\":\"+\",\"dsr\":[184,189,null,null]}'>+</span>Text
</h2>\n<p data-parsoid='{\"dsr\":[197,206,0,0]}'>section 4</p>\n\n<h2
id=\"Underscore_between_Text\" data-parsoid='{\"dsr\":[208,237,2,2]}'>
Underscore_between_Text </h2>\n<p
data-parsoid='{\"dsr\":[238,247,0,0]}'>section 5</p>\n\n<h2
id=\"Underscore-Entity_between_Text\" data-parsoid='{\"dsr\":[249,293,2,2]}'>
Underscore-Entity<span typeof=\"mw:Entity\"
data-parsoid='{\"src\":\"&#95;\",\"srcContent\":\"_\",\"dsr\":[269,274,null,null]}'>_</span>between<span
typeof=\"mw:Entity\"
data-parsoid='{\"src\":\"&#95;\",\"srcContent\":\"_\",\"dsr\":[281,286,null,null]}'>_</span>Text
</h2>\n<p data-parsoid='{\"dsr\":[294,303,0,0]}'>section 6</p>\n\n<p
data-parsoid='{\"dsr\":[305,501,0,0]}'><a rel=\"mw:WikiLink\"
href=\"./Main_Page#Space_between_Text\"
data-parsoid='{\"stx\":\"simple\",\"a\":{\"href\":\"./Main_Page#Space_between_Text\"},\"sa\":{\"href\":\"#Space
between Text\"},\"dsr\":[305,328,2,2]}'>#Space between Text</a>\n<a
rel=\"mw:WikiLink\" href=\"./Main_Page#Space-Entity_between_Text\"
data-parsoid='{\"stx\":\"simple\",\"a\":{\"href\":\"./Main_Page#Space-Entity_between_Text\"},\"sa\":{\"href\":\"#Space-Entity&#32;between&#32;Text\"},\"dsr\":[329,367,2,2]}'>#Space-Entity
between Text</a>\n<a rel=\"mw:WikiLink\"
href=\"./Main_Page#Plus.2Bbetween.2BText\"
data-parsoid='{\"stx\":\"simple\",\"a\":{\"href\":\"./Main_Page#Plus.2Bbetween.2BText\"},\"sa\":{\"href\":\"#Plus+between+Text\"},\"dsr\":[368,390,2,2]}'>#Plus+between+Text</a>\n<a
rel=\"mw:WikiLink\" href=\"./Main_Page#Plus-Entity.2Bbetween.2BText\"
data-parsoid='{\"stx\":\"simple\",\"a\":{\"href\":\"./Main_Page#Plus-Entity.2Bbetween.2BText\"},\"sa\":{\"href\":\"#Plus-Entity&#43;between&#43;Text\"},\"dsr\":[391,428,2,2]}'>#Plus-Entity+between+Text</a>\n<a
rel=\"mw:WikiLink\" href=\"./Main_Page#Underscore_between_Text\"
data-parsoid='{\"stx\":\"simple\",\"a\":{\"href\":\"./Main_Page#Underscore_between_Text\"},\"sa\":{\"href\":\"#Underscore_between_Text\"},\"dsr\":[429,457,2,2]}'>#Underscore_between_Text</a>\n<a
rel=\"mw:WikiLink\" href=\"./Main_Page#Underscore-Entity_between_Text\"
data-parsoid='{\"stx\":\"simple\",\"a\":{\"href\":\"./Main_Page#Underscore-Entity_between_Text\"},\"sa\":{\"href\":\"#Underscore-Entity&#95;between&#95;Text\"},\"dsr\":[458,501,2,2]}'>#Underscore-Entity_between_Text</a></p>");
add("wt2html", "Headers with excess '=' characters\n(Are similar tests
necessary beyond the 1st level?)", "<h1 id=\"foo.3D\"
data-parsoid='{\"dsr\":[0,6,1,1]}'>foo=</h1>\n<h1 id=\".3Dfoo\"
data-parsoid='{\"dsr\":[7,13,1,1]}'>=foo</h1>\n<h1 id=\"italic_heading.3D\"
data-parsoid='{\"dsr\":[14,35,1,1]}'><i
data-parsoid='{\"dsr\":[15,25,2,2]}'>italic</i> heading=</h1>\n<h1
id=\".3Ditalic_heading\" data-parsoid='{\"dsr\":[36,57,1,1]}'>=<i
data-parsoid='{\"dsr\":[38,48,2,2]}'>italic</i> heading</h1>");
@@ -862,7 +861,6 @@
add("html2wt", "__NOEDITSECTION__ keyword", "== Section 1 ==\n\n== Section 2
==\n");
add("html2wt", "Link inside a section heading", "== Section with a [[wiki/Main
Page|link]] in it ==\n");
add("html2wt", "TOC regression (T14077)", "<div id=\"toc\" class=\"toc\"><div
class=\"toctitle\">\n== Contents ==\n</div>\n\n* [[#title_1|<span
class=\"tocnumber\">1</span> <span class=\"toctext\">title 1</span>]]\n\n**
[[#title_1.1|<span class=\"tocnumber\">1.1</span> <span class=\"toctext\">title
1.1</span>]]\n* [[#title_2|<span class=\"tocnumber\">2</span> <span
class=\"toctext\">title 2</span>]]\n\n</div>\n\n== title 1 ==\n\n=== title 1.1
===\n\n== title 2 ==\n");
-add("html2wt", "Short headings with trailing space should match behavior of
Parser::doHeadings (T21910)", "= = =\nThe line above must have a trailing
space!\n\n= = =\nBut just in case it doesn't...\n");
add("html2wt", "Header with special characters (T27462)", "The tooltips shall
not show entities to the user (ie. be double escaped)\n\n<div id=\"toc\"
class=\"toc\"><div class=\"toctitle\">\n== Contents ==\n</div>\n\n*
[[#text_.3E_text|<span class=\"tocnumber\">1</span> <span
class=\"toctext\">text > text</span>]]\n* [[#text_.3C_text|<span
class=\"tocnumber\">2</span> <span class=\"toctext\">text < text</span>]]\n*
[[#text_.26_text|<span class=\"tocnumber\">3</span> <span
class=\"toctext\">text & text</span>]]\n* [[#text_.27_text|<span
class=\"tocnumber\">4</span> <span class=\"toctext\">text ' text</span>]]\n*
[[#text_.22_text|<span class=\"tocnumber\">5</span> <span
class=\"toctext\">text \" text</span>]]\n\n</div>\n\n== text > text ==\nsection
1\n\n== text < text ==\nsection 2\n\n== text & text ==\nsection 3\n\n== text '
text ==\nsection 4\n\n== text \" text ==\nsection 5\n");
add("html2wt", "Header with space, plus and underscore as entity", "Id should
not contain + for spaces\n\n<div id=\"toc\" class=\"toc\"><div
class=\"toctitle\">\n== Contents ==\n</div>\n\n* [[#Space_between_Text|<span
class=\"tocnumber\">1</span> <span class=\"toctext\">Space between
Text</span>]]\n* [[#Space-Entity_between_Text|<span
class=\"tocnumber\">2</span> <span class=\"toctext\">Space-Entity between
Text</span>]]\n* [[#Plus.2Bbetween.2BText|<span class=\"tocnumber\">3</span>
<span class=\"toctext\">Plus+between+Text</span>]]\n*
[[#Plus-Entity.2Bbetween.2BText|<span class=\"tocnumber\">4</span> <span
class=\"toctext\">Plus-Entity+between+Text</span>]]\n*
[[#Underscore_between_Text|<span class=\"tocnumber\">5</span> <span
class=\"toctext\">Underscore_between_Text</span>]]\n*
[[#Underscore-Entity_between_Text|<span class=\"tocnumber\">6</span> <span
class=\"toctext\">Underscore-Entity_between_Text</span>]]\n\n</div>\n\n== Space
between Text ==\nsection 1\n\n== Space-Entity between Text ==\nsection 2\n\n==
Plus+between+Text ==\nsection 3\n\n== Plus-Entity+between+Text ==\nsection
4\n\n== Underscore_between_Text ==\nsection 5\n\n==
Underscore-Entity_between_Text ==\nsection 6\n\n[[#Space_between_Text|#Space
between Text]]\n[[#Space-Entity_between_Text|#Space-Entity between
Text]]\n[[#Plus.2Bbetween.2BText|#Plus+between+Text]]\n[[#Plus-Entity.2Bbetween.2BText|#Plus-Entity+between+Text]]\n[[#Underscore_between_Text|#Underscore_between_Text]]\n[[#Underscore-Entity_between_Text|#Underscore-Entity_between_Text]]\n");
add("html2wt", "Headers with excess '=' characters\n(Are similar tests
necessary beyond the 1st level?)", "<div id=\"toc\" class=\"toc\"><div
class=\"toctitle\">\n== Contents ==\n</div>\n\n* [[#foo.3D|<span
class=\"tocnumber\">1</span> <span class=\"toctext\">foo=</span>]]\n*
[[#.3Dfoo|<span class=\"tocnumber\">2</span> <span
class=\"toctext\">=foo</span>]]\n* [[#italic_heading.3D|<span
class=\"tocnumber\">3</span> <span class=\"toctext\">''italic''
heading=</span>]]\n* [[#.3Ditalic_heading|<span class=\"tocnumber\">4</span>
<span class=\"toctext\">=''italic'' heading</span>]]\n\n</div>\n\n= foo= =\n\n=
=foo =\n\n= ''italic'' heading= =\n\n= =''italic'' heading =\n");
diff --git a/tests/parserTests.txt b/tests/parserTests.txt
index b1faa82..3d6e6a9 100644
--- a/tests/parserTests.txt
+++ b/tests/parserTests.txt
@@ -16866,21 +16866,30 @@
<p><a rel="mw:ExtLink" href="http://example.com">http://example.com</a>
<figure-inline class="mw-default-size" typeof="mw:Image"><a
href="./File:Foobar.jpg"><img resource="./File:Foobar.jpg"
src="//example.com/images/3/3a/Foobar.jpg" data-file-width="1941"
data-file-height="220" data-file-type="bitmap" height="220"
width="1941"/></a></figure-inline></p>
!!end
+# Parsoid doesn't wt2wt this cleanly because it adds <nowiki>s.
!! test
Short headings with trailing space should match behavior of Parser::doHeadings
(T21910)
+!! options
+parsoid=wt2html,html2html
!! wikitext
===
The line above must have a trailing space!
=== <!--
--> <!-- -->
But just in case it doesn't...
-!! html
+!! html/php
<h1><span class="mw-headline" id=".3D">=</span><span
class="mw-editsection"><span class="mw-editsection-bracket">[</span><a
href="/index.php?title=Parser_test&action=edit&section=1" title="Edit
section: =">edit</a><span class="mw-editsection-bracket">]</span></span></h1>
<p>The line above must have a trailing space!
</p>
<h1><span class="mw-headline" id=".3D_2">=</span><span
class="mw-editsection"><span class="mw-editsection-bracket">[</span><a
href="/index.php?title=Parser_test&action=edit&section=2" title="Edit
section: =">edit</a><span class="mw-editsection-bracket">]</span></span></h1>
<p>But just in case it doesn't...
</p>
+!! html/parsoid
+<h1 id=".3D">=</h1>
+<p>The line above must have a trailing space!</p>
+<h1 id=".3D_2">=</h1> <!--
+--> <!-- -->
+<p>But just in case it doesn't...</p>
!! end
!! test
@@ -24845,17 +24854,53 @@
!! options
parsoid=html2wt
!! html/parsoid
-<p>===
-=foo= x
+<p>=foo= x
=foo= <s></s>
</p>
!! wikitext
-===
=foo= x
=foo= <s></s>
+!! html/php
+<p>=foo= x
+=foo= <s></s>
+</p>
!!end
!! test
+Headings: 4c. Short headings (1)
+!! options
+parsoid=html2wt
+!! html/parsoid
+<p>===
+</p>
+!! wikitext
+<nowiki>===</nowiki>
+!! html/php
+<p>===
+</p>
+!! end
+
+# in the html2wt direction we emit '= = =' or '=<nowiki>=</nowiki>='
+!! test
+Headings: 4d. Short headings (2)
+!! options
+parsoid=wt2html,html2html
+!! wikitext
+===
+====
+=====
+!! html/php
+<h1><span class="mw-headline" id=".3D">=</span><span
class="mw-editsection"><span class="mw-editsection-bracket">[</span><a
href="/index.php?title=Parser_test&action=edit&section=1" title="Edit
section: =">edit</a><span class="mw-editsection-bracket">]</span></span></h1>
+<h1><span class="mw-headline" id=".3D.3D">==</span><span
class="mw-editsection"><span class="mw-editsection-bracket">[</span><a
href="/index.php?title=Parser_test&action=edit&section=2" title="Edit
section: ==">edit</a><span class="mw-editsection-bracket">]</span></span></h1>
+<h2><span class="mw-headline" id=".3D_2">=</span><span
class="mw-editsection"><span class="mw-editsection-bracket">[</span><a
href="/index.php?title=Parser_test&action=edit&section=3" title="Edit
section: =">edit</a><span class="mw-editsection-bracket">]</span></span></h2>
+
+!! html/parsoid
+<h1 id=".3D">=</h1>
+<h1 id=".3D.3D">==</h1>
+<h2 id=".3D_2">=</h2>
+!! end
+
+!! test
Headings: 5. Empty headings
!! options
parsoid=html2wt
--
To view, visit https://gerrit.wikimedia.org/r/393908
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I11926f2d2365755794d8f8f6647b1f0b02b827ab
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: C. Scott Ananian <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits