Cscott has uploaded a new change for review.
https://gerrit.wikimedia.org/r/79925
Change subject: WIP: Tweak QuoteTransformer to more closely match PHP doQuotes.
......................................................................
WIP: Tweak QuoteTransformer to more closely match PHP doQuotes.
We accumulate a chunks list alternating between quote and non-quote content,
like doQuotes does, and can look at more than one previous token's context
when deciding whether to balance quotes. We still don't match PHP's
quote parser exactly, but we get closer.
Also duplicate doQuotes algorithm for converting quotes to tags; this
avoids depending on the html tree builder and ensures that our quote
sequences match the PHP parser so that we pass parserTests.
We edit some previously-forked parserTests, now that our output is
closer to the PHP parser's output. Parsoid still leaves empty open/close
tag pairs where PHP strips them, so we need to fork a few new tests as
well.
Bug: 49926
Change-Id: I7e19565734e7ea1366b95471e46753423e208241
---
M js/lib/ext.core.QuoteTransformer.js
M js/lib/pegTokenizer.pegjs.txt
M js/tests/parserTests.txt
3 files changed, 196 insertions(+), 219 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid
refs/changes/25/79925/1
diff --git a/js/lib/ext.core.QuoteTransformer.js
b/js/lib/ext.core.QuoteTransformer.js
index c599aae..d14d939 100644
--- a/js/lib/ext.core.QuoteTransformer.js
+++ b/js/lib/ext.core.QuoteTransformer.js
@@ -21,13 +21,12 @@
QuoteTransformer.prototype.anyRank = 2.101; // Just after regular quote and
newline
QuoteTransformer.prototype.reset = function ( ) {
- // A chunk starts with a token context around a quote token and is
- // (optionally) followed by non-quote tokens. The quote token and its
- // context is later replaced with the actual tag token for italic or
bold.
- this.currentChunk = [];
- // List of chunks, each starting with a (potentially) bold or italic
token
- // and followed by plain tokens.
+ // Chunks alternate between quote tokens and sequences of non-quote
+ // tokens. The quote tokens are later replaced with the actual tag
+ // token for italic or bold. The first chunk is a non-quote chunk.
this.chunks = [];
+ // The current chunk we're accumulating into.
+ this.currentChunk = [];
// References to chunks in which the first token context / quote token
// should be converted to italic or bold tokens.
this.italics = [];
@@ -49,7 +48,6 @@
QuoteTransformer.prototype._startNewChunk = function ( ) {
this.chunks.push( this.currentChunk );
this.currentChunk = [];
- this.currentChunk.pos = this.chunks.length - 1;
};
// Handle QUOTE tags. These are collected in italic/bold lists depending on
@@ -57,17 +55,7 @@
// appropriate tag tokens is deferred until the next NEWLINE token triggers
// onNewLine.
QuoteTransformer.prototype.onQuote = function ( token, frame, prevToken ) {
- var qlen = token.value.length,
- ctx = {
- token: token,
- frame: frame,
- prevToken: prevToken
- },
- ctx2 = {
- frame: frame,
- prevToken: prevToken
- },
- tsr;
+ var qlen = token.value.length;
if ( ! this.isActive ) {
this.dispatcher.addTransform( this.onNewLine.bind(this),
"QuoteTransformer:onNewLine",
@@ -84,77 +72,21 @@
// register for any token if not yet active
this.dispatcher.addTransform( this.onAny.bind(this),
"QuoteTransformer:onAny", this.anyRank, 'any' );
this.isActive = true;
+ // add initial context to chunks (we'll get rid of it later)
+ this.currentChunk.push( prevToken ? prevToken : '' );
}
-
- this._startNewChunk();
switch (qlen) {
case 2:
- this.currentChunk.push(ctx);
- this.italics.push(this.currentChunk);
- break;
case 3:
- this.currentChunk.push(ctx);
- this.bolds.push(this.currentChunk);
+ case 5:
+ this._startNewChunk();
+ this.currentChunk.push( token );
+ this._startNewChunk();
break;
case 4:
- this.currentChunk.push( "'" );
- this._startNewChunk();
- this.currentChunk.push(ctx);
- this.bolds.push(this.currentChunk);
- break;
- case 5:
- // The order of italic vs. bold does not matter. Those
are
- // processed in a fixed order, and any nesting issues
are fixed up
- // by the HTML 5 tree builder. This does not always
result in the
- // prettiest result, but at least it is always correct
and very
- // convenient.
-
- tsr = ctx.token.dataAttribs ? ctx.token.dataAttribs.tsr
: null;
- if ( tsr ) {
- ctx.token = ctx.token.clone();
- ctx.token.dataAttribs.tsr = [tsr[0], tsr[0] +
2];
- }
- this.currentChunk.push(ctx);
- this.italics.push(this.currentChunk);
-
- // Now for the bold..
- this._startNewChunk();
- ctx2.token = {
- attribs: ctx.token.attribs
- };
- if ( tsr ) {
- // Get the correct tsr range for the bold
- ctx2.token.dataAttribs = { tsr: [tsr[1] - 3,
tsr[1]] };
- }
- this.currentChunk.push(ctx2);
- this.bolds.push(this.currentChunk);
- break;
- default: // longer than 5, only use the last 5 ticks
- var newvalue = token.value.substr(0, qlen - 5 );
- tsr = ctx.token.dataAttribs ? ctx.token.dataAttribs.tsr
: null;
- // update tsr for italic token
- if ( tsr ) {
- ctx.token = ctx.token.clone();
- ctx.token.dataAttribs.tsr = [tsr[0] + qlen - 5,
tsr[1] - 3];
- }
-
- this.currentChunk.push ( newvalue );
- this._startNewChunk();
- this.currentChunk.push(ctx);
- this.italics.push(this.currentChunk);
-
- // Now for the bold..
- this._startNewChunk();
- ctx2.token = {
- attribs: ctx.token.attribs
- };
- if ( tsr ) {
- // Get the correct tsr range for the bold
- ctx2.token.dataAttribs = { tsr: [tsr[1] - 3,
tsr[1]] };
- }
- this.currentChunk.push(ctx2);
- this.bolds.push(this.currentChunk);
+ default:
+ console.assert(false, "should be transformed by
tokenizer");
break;
}
@@ -170,51 +102,56 @@
// Handle NEWLINE tokens, which trigger the actual quote analysis on the
// collected quote tokens so far.
QuoteTransformer.prototype.onNewLine = function ( token, frame, prevToken ) {
- var res;
+ var res, qlen;
if( ! this.isActive ) {
// Nothing to do, quick abort.
return { token: token };
}
-
//token.rank = this.quoteAndNewlineRank;
- //console.warn('chunks: ' + JSON.stringify( this.chunks, null, 2 ) );
+ // count number of bold and italics
+ var numbold = 0, numitalics = 0;
+ for (var i = 1; i < this.chunks.length; i += 2) {
+ console.assert(this.chunks[i].length===1); // quote token
+ qlen = this.chunks[i][0].value.length;
+ if (qlen===2 || qlen===5) { numitalics++; }
+ if (qlen===3 || qlen===5) { numbold++; }
+ }
- //console.warn("onNewLine: " + this.italics.length + 'i/b' +
this.bolds.length);
// balance out tokens, convert placeholders into tags
- if (this.italics.length % 2 && this.bolds.length % 2) {
+ if ( (numitalics % 2 === 1) && (numbold % 2 === 1) ) {
var firstsingleletterword = -1,
firstmultiletterword = -1,
firstspace = -1;
- for (var j = 0; j < this.bolds.length; j++) {
- var ctx = this.bolds[j][0];
- var ctxPrevToken = ctx.prevToken;
- //console.warn("balancing!" +
JSON.stringify(ctxPrevToken, null, 2));
- if (ctxPrevToken) {
- if (ctxPrevToken.constructor === String) {
- var lastchar =
ctxPrevToken[ctxPrevToken.length - 1],
- secondtolastchar =
ctxPrevToken[ctxPrevToken.length - 2];
- if (lastchar === ' ' && firstspace ===
-1) {
- firstspace = j;
- } else if (lastchar !== ' ') {
- if ( secondtolastchar === ' ' &&
-
firstsingleletterword === -1)
- {
- firstsingleletterword =
j;
- } else if (
firstmultiletterword === -1) {
- firstmultiletterword =
j;
- }
- }
- } else if ( ( ctxPrevToken.constructor === NlTk
||
-
ctxPrevToken.constructor === TagTk ||
-
ctxPrevToken.constructor === SelfclosingTagTk ) &&
-
firstmultiletterword === -1 ) {
- // This is an approximation, as the
original doQuotes
- // operates on the source and just
looks at space vs.
- // non-space. At least some tags are
thus recognized as
- // words in the original implementation.
- firstmultiletterword = j;
+ for (var i = 1; i < this.chunks.length; i += 2) {
+ // only look at bold tags
+ if (this.chunks[i][0].value.length !== 3) { continue; }
+ // find the first previous token which is text
+ // (this is an approximation, since the wikitext
algorithm looks
+ // at the raw unparsed source here)
+ var prevChunk = this.chunks[i-1], ctxPrevToken = '';
+ for (var j = prevChunk.length - 1; j >= 0; j--) {
+ if (prevChunk[j].constructor === String) {
+ ctxPrevToken = prevChunk[j];
+ break;
+ }
+ }
+ //console.warn("balancing! " + ctxPrevToken);
+ var lastchar = ctxPrevToken[ctxPrevToken.length - 1],
+ secondtolastchar = ctxPrevToken[ctxPrevToken.length -
2];
+ if (lastchar === ' ' && firstspace === -1) {
+ firstspace = i;
+ } else if (lastchar !== ' ') {
+ if ( secondtolastchar === ' ' &&
+ firstsingleletterword === -1)
+ {
+ firstsingleletterword = i;
+ // if firstsingleletterword is set, we
don't need
+ // to look at the options options, so
we can bail early
+ break;
+ } else if ( firstmultiletterword === -1) {
+ firstmultiletterword = i;
}
}
}
@@ -229,27 +166,101 @@
this.convertBold(firstmultiletterword);
} else if (firstspace > -1) {
this.convertBold(firstspace);
- } else if ( !this.bolds[0][0].prevToken ) {
- // In this block, there is no previous token for the
first bold,
- // because the bold token is the first thing in the
stream.
- // In that case, we need to treat that as being the
first space,
- // basically, because the start of the string is
basically a
- // start-of-word.
- this.convertBold( 0 );
+ } else {
+ // (notice that it is possible for all three to be -1
if, for
+ // example, there is only one pentuple-apostrophe in
the line)
+ // do no balancing.
}
}
- this.quotesToTags( this.italics, 'i' );
- this.quotesToTags( this.bolds, 'b' );
-
- this.currentChunk.push( token );
- this._startNewChunk();
-
- //console.warn('chunks: ' + JSON.stringify( this.chunks, null, 2 ) );
+ // this is the same state machine as the php parser uses.
+ var lastboth = -1, state = '';
+ for (var i = 1; i < this.chunks.length; i += 2) {
+ console.assert(this.chunks[i].length === 1);
+ qlen = this.chunks[i][0].value.length;
+ if (qlen === 2) {
+ if (state === 'i') {
+ this.quoteToTag(i, [new EndTagTk( 'i' )]);
+ state = '';
+ } else if (state === 'bi') {
+ this.quoteToTag(i, [new EndTagTk( 'i' )]);
+ state = 'b';
+ } else if (state === 'ib') {
+ // annoying!
+ this.quoteToTag(i, [new EndTagTk( 'b' ), new
EndTagTk( 'i' ),
+ new
TagTk( 'b' )], "bogus two");
+ state = 'b';
+ } else if (state === 'both') {
+ this.quoteToTag(lastboth, [new TagTk( 'b' ),
new TagTk( 'i' )]);
+ this.quoteToTag(i, [new EndTagTk( 'i' )]);
+ state = 'b';
+ } else { // state can be 'b' or ''
+ this.quoteToTag(i, [new TagTk( 'i' )]);
+ state += 'i';
+ }
+ } else if (qlen === 3) {
+ if (state === 'b') {
+ this.quoteToTag(i, [new EndTagTk( 'b' )]);
+ state = '';
+ } else if (state === 'ib') {
+ this.quoteToTag(i, [new EndTagTk( 'b' )]);
+ state = 'i';
+ } else if (state === 'bi') {
+ // annoying!
+ this.quoteToTag(i, [new EndTagTk( 'i' ), new
EndTagTk( 'b' ),
+ new
TagTk( 'i' )], "bogus two");
+ state = 'i';
+ } else if (state === 'both') {
+ this.quoteToTag(lastboth, [new TagTk( 'i' ),
new TagTk( 'b' )]);
+ this.quoteToTag(i, [new EndTagTk( 'b' )]);
+ state = 'i';
+ } else { // state can be 'i' or ''
+ this.quoteToTag(i, [new TagTk( 'b' )]);
+ state += 'b';
+ }
+ } else if (qlen === 5) {
+ if (state === 'b') {
+ this.quoteToTag(i, [new EndTagTk( 'b' ), new
TagTk( 'i' )]);
+ state = 'i';
+ } else if (state === 'i') {
+ this.quoteToTag(i, [new EndTagTk( 'i' ), new
TagTk( 'b' )]);
+ state = 'b';
+ } else if (state === 'bi') {
+ this.quoteToTag(i, [new EndTagTk( 'i' ), new
EndTagTk( 'b' )]);
+ state = '';
+ } else if (state === 'ib') {
+ this.quoteToTag(i, [new EndTagTk( 'b' ), new
EndTagTk( 'i' )]);
+ state = '';
+ } else if (state === 'both') {
+ this.quoteToTag(lastboth, [new TagTk( 'i' ),
new TagTk( 'b' )]);
+ this.quoteToTag(i, [new EndTagTk( 'b' ), new
EndTagTk( 'i' )]);
+ state = '';
+ } else { // state == ''
+ lastboth = i;
+ state = 'both';
+ }
+ }
+ }
+ // now close all remaining tags. notice that order is important.
+ if ( state === 'both' ) {
+ this.quoteToTag(lastboth, [new TagTk( 'b' ), new TagTk( 'i' )]);
+ state = 'bi';
+ }
+ if ( state === 'b' || state === 'ib' ) {
+ this.currentChunk.push( new EndTagTk( 'b', [],
{autoInsertedEnd:1} ) );
+ }
+ if ( state === 'i' || state === 'bi' || state === 'ib' ) {
+ this.currentChunk.push( new EndTagTk( 'i', [],
{autoInsertedEnd:1} ) );
+ }
+ if ( state === 'bi' ) {
+ this.currentChunk.push( new EndTagTk( 'b', [],
{autoInsertedEnd:1} ) );
+ }
// return all collected tokens including the newline
+ this.currentChunk.push( token );
+ this._startNewChunk();
+ this.chunks[0].shift(); // remove 'prevToken' before first quote.
res = { tokens: Array.prototype.concat.apply([], this.chunks) };
-
// prepare for next line
this.reset();
@@ -268,67 +279,48 @@
// Convert a bold token to italic to balance an uneven number of both bold and
// italic tags. In the process, one quote needs to be converted back to text.
QuoteTransformer.prototype.convertBold = function ( i ) {
- var chunk = this.bolds[i],
- textToken = "'";
- if ( chunk.pos ) {
- this.chunks[chunk.pos].push( textToken );
- } else {
- // prepend another chunk
- this.chunks.unshift( [ textToken ] );
+ // this should be a bold tag.
+ console.assert(i > 0 && this.chunks[i].length===1 &&
+ this.chunks[i][0].value.length === 3);
+ // we're going to convert it to a single plain text ' plus an italic tag
+ this.chunks[i-1].push( "'" );
+ var oldbold = this.chunks[i][0];
+ var tsr = oldbold.dataAttribs ? oldbold.dataAttribs.tsr : null;
+ if ( tsr ) {
+ tsr = [ tsr[0]+1, tsr[1] ];
}
-
- // delete from bolds
- this.bolds.splice(i, 1);
-
- this.italics.push(chunk);
- this.italics.sort(function(a,b) { return a.pos - b.pos; } );
+ var newbold = new SelfclosingTagTk( 'mw-quote', [], { tsr: tsr });
+ newbold.value = "''"; // italic!
+ this.chunks[i] = [ newbold ];
};
// Convert italics/bolds into tags
-QuoteTransformer.prototype.quotesToTags = function ( chunks, name ) {
- var toggle = true,
- t,
- j,
- newToken,
- nameToWidth = {
- b: 3,
- i: 2
- };
-
- for (j = 0; j < chunks.length; j++) {
- //console.warn( 'quotesToTags ' + name + ': ' + JSON.stringify(
chunks, null, 2 ) );
- t = chunks[j][0].token;
- //console.warn( 'quotesToTags t: ' + JSON.stringify( t, null,
2));
-
- if(toggle) {
- newToken = new TagTk( name, t.attribs,
- // Mark last element as auto-closed
- j === chunks.length - 1 ? {
autoInsertedEnd: 1 } : {} );
- } else {
- newToken = new EndTagTk( name, t.attribs, {} );
- }
- if (t.dataAttribs && t.dataAttribs.tsr) {
- var tsr = t.dataAttribs.tsr,
- len = tsr[1] - tsr[0];
- // Verify if we the tsr value is accurate
- // SSS FIXME: We could potentially adjust tsr based on
length
- // but dont know yet whether to fix tsr[0] or tsr[1]
- if (len === nameToWidth[name]) {
- newToken.dataAttribs.tsr = Util.clone(tsr);
- } else {
- // we generally use the last quotes, so adjust
the tsr to that
- newToken.dataAttribs.tsr = [tsr[1] -
nameToWidth[name], tsr[1]];
+QuoteTransformer.prototype.quoteToTag = function( chunk, tags, ignoreBogusTwo
){
+ console.assert(this.chunks[chunk].length === 1);
+ var result = [];
+ var oldtag = this.chunks[chunk][0];
+ // make tsr
+ var tsr = oldtag.dataAttribs ? oldtag.dataAttribs.tsr : null;
+ var startpos = tsr ? tsr[0] : null, endpos = tsr ? tsr[1] : null;
+ for (var i=0; i<tags.length; i++) {
+ if (tsr) {
+ if ( i < 2 && ignoreBogusTwo ) {
+ tags[i].dataAttribs.tsr = [ startpos, startpos
];
+ tags[i].dataAttribs.autoInsertedEnd = 1;
+ } else if (tags[i].name === 'b') {
+ tags[i].dataAttribs.tsr = [ startpos, startpos
+ 3 ];
+ } else if (tags[i].name === 'i') {
+ tags[i].dataAttribs.tsr = [ startpos, startpos
+ 2 ];
+ } else { console.assert(false); }
+ if ( i === (tags.length-1) && ignoreBogusTwo ) {
+ tags[i].dataAttribs.tsr[1] = endpos;
}
+ startpos = tags[i].dataAttribs.tsr[1];
}
-
- chunks[j][0] = newToken;
-
- toggle = !toggle;
+ result.push(tags[i]);
}
- if (!toggle) {
- // Add end tag
- this.currentChunk.push( new EndTagTk( name ) );
- }
+ if (tsr) { console.assert(startpos === endpos, startpos, endpos); }
+ this.chunks[chunk] = result;
};
if (typeof module === "object") {
diff --git a/js/lib/pegTokenizer.pegjs.txt b/js/lib/pegTokenizer.pegjs.txt
index 57a09c3..7f52739 100644
--- a/js/lib/pegTokenizer.pegjs.txt
+++ b/js/lib/pegTokenizer.pegjs.txt
@@ -1207,9 +1207,23 @@
* bolds/italics and MediaWiki's special heuristics for apostrophes, which are
* all not context free. */
quote = "''" x:"'"* {
- var res = new SelfclosingTagTk( 'mw-quote', [], { tsr: [pos0, pos] } ); //
Will be consumed in token transforms
- res.value = "''" + x.join('');
- return res;
+ // sequences of four or more than five quotes are assumed to start
+ // with some number of plain-text apostrophes.
+ var quotes = "''" + x.join(''), plainticks = 0, result = [];
+ if (quotes.length === 4) {
+ plainticks = 1;
+ } else if (quotes.length > 5) {
+ plainticks = quotes.length - 5;
+ }
+ if (plainticks > 0) {
+ result.push(quotes.substring(0, plainticks));
+ }
+ // mw-quote token Will be consumed in token transforms
+ var mwq = new SelfclosingTagTk( 'mw-quote', [],
+ { tsr: [pos0 + plainticks, pos] } );
+ mwq.value = quotes.substring(plainticks);
+ result.push(mwq);
+ return result;
}
diff --git a/js/tests/parserTests.txt b/js/tests/parserTests.txt
index 71bf309..721a3cb 100644
--- a/js/tests/parserTests.txt
+++ b/js/tests/parserTests.txt
@@ -643,7 +643,7 @@
!! input
''''foo'''''
!! result
-<p>'<b>foo<i></i></b>
+<p>'<b>foo</b><i></i>
</p>
!!end
@@ -653,24 +653,12 @@
###
!! test
-Italics and bold: 5-quote opening sequence: (5,2) (php)
+Italics and bold: 5-quote opening sequence: (5,2)
!! options
-php
!! input
'''''foo''
!! result
<p><b><i>foo</i></b>
-</p>
-!!end
-# Parsoid reverses the nesting order, compared to the PHP parser
-!! test
-Italics and bold: 5-quote opening sequence: (5,2) (parsoid)
-!! options
-parsoid
-!! input
-'''''foo''
-!! result
-<p><i><b>foo</b></i>
</p>
!!end
@@ -815,30 +803,13 @@
!!end
-# The Parsoid team believes the PHP parser's output on this test is wrong.
-# It only checks for convert-to-bold-on-single-character-word when the word
-# matches with a bold tag ("'''") that is *odd* in the list of quote tokens.
-# This means that the bold token in position 2 (0-indexed) gets converted by
-# parsoid, but doesn't get changed by the PHP parser.
!! test
-Italics and bold: other quote tests: (3,2,3,3) (php)
+Italics and bold: other quote tests: (3,2,3,3)
!! options
-php
!! input
'''this is about ''foo'''s family'''
!! result
<p>'<i>this is about </i>foo<b>s family</b>
-</p>
-!!end
-# This is the output the Parsoid team believes to be correct.
-!! test
-Italics and bold: other quote tests: (3,2,3,3) (parsoid)
-!! options
-parsoid
-!! input
-'''this is about ''foo'''s family'''
-!! result
-<p><b>this is about <i>foo'</i>s family</b>
</p>
!!end
--
To view, visit https://gerrit.wikimedia.org/r/79925
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I7e19565734e7ea1366b95471e46753423e208241
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Cscott <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits