https://www.mediawiki.org/wiki/Special:Code/MediaWiki/113020
Revision: 113020
Author: gwicke
Date: 2012-03-05 12:00:38 +0000 (Mon, 05 Mar 2012)
Log Message:
-----------
Change wikilink tokenization strategy to split on pipes. This makes it
possible to support template / template argument expansion in image options,
and causes little trouble for wikilinks. Non-image wikilinks with multiple
text pipes are quite rare in the dumps, and concatenating description tokens
with a plain '|' is quite easy. 261 parser tests passing.
Modified Paths:
--------------
trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js
trunk/extensions/VisualEditor/modules/parser/ext.core.ParserFunctions.js
trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js
trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js
trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js
trunk/extensions/VisualEditor/tests/parser/parserTests.js
Modified: trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js
===================================================================
--- trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js
2012-03-05 11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js
2012-03-05 12:00:38 UTC (rev 113020)
@@ -24,12 +24,12 @@
WikiLinkHandler.prototype.rank = 1.15; // after AttributeExpander
WikiLinkHandler.prototype.onWikiLink = function ( token, manager, cb ) {
- var env = this.manager.env;
+ var env = this.manager.env,
+ href = env.lookupKV( token.attribs, 'href' ).v,
+ tail = env.lookupKV( token.attribs, 'tail' ).v;
var title = this.manager.env.makeTitleFromPrefixedText(
- env.tokensToString(
- env.lookupKV( token.attribs, 'href' ).v
- )
- );
+ env.tokensToString( href )
+ );
if ( title.ns.isFile() ) {
return this.renderFile( token, manager, cb, title );
@@ -39,16 +39,65 @@
} else {
// Check if page exists
//
- var obj = new TagTk( 'a', [ this.manager.env.lookupKV(
token.attribs, 'href' ) ] );
+ //console.warn( 'title: ' + JSON.stringify( title ) );
+ var obj = new TagTk( 'a', [ new KV( 'href', title.makeLink() )
] ),
+ content = this.manager.env.lookupKV( token.attribs,
'content' ).v;
+ //console.warn('content: ' + JSON.stringify( content, null, 2 )
);
+ // XXX: handle trail
+ if ( content.length ) {
+ var out = []
+ for ( var i = 0, l = content.length; i < l ; i++ ) {
+ out = out.concat( content[i] );
+ if ( i < l - 1 ) {
+ out.push( '|' );
+ }
+ }
+ content = out;
+ } else {
+ content = href;
+ }
+ if ( tail ) {
+ content.push( tail );
+ }
+
obj.attribs.push( new KV('data-mw-type', 'internal') );
- var out = [obj].concat( this.manager.env.lookupKV(
token.attribs, 'content' ).v,
- new EndTagTk(
'a' ) );
+ var out = [obj].concat( content, new EndTagTk( 'a' ) );
//console.warn( JSON.stringify( out, null, 2 ) );
return { tokens: out };
}
};
+WikiLinkHandler.prototype._simpleImageOptions = {
+ // halign
+ 'left': 'halign',
+ 'right': 'halign',
+ 'center': 'halign',
+ 'none': 'halign',
+ // valign
+ 'baseline': 'valign',
+ 'sub': 'valign',
+ 'super': 'valign',
+ 'top': 'valign',
+ 'text-top': 'valign',
+ 'middle': 'valign',
+ 'bottom': 'valign',
+ 'text-bottom': 'valign',
+ // format
+ 'border': 'format',
+ 'frameless': 'format',
+ 'frame': 'format',
+ 'thumbnail': 'format',
+ 'thumb': 'format'
+};
+WikiLinkHandler.prototype._prefixImageOptions = {
+ 'link': 'link',
+ 'alt': 'alt',
+ 'page': 'page',
+ 'thumbnail': 'thumb',
+ 'thumb': 'thumb'
+};
+
WikiLinkHandler.prototype.renderFile = function ( token, manager, cb, title ) {
var env = manager.env;
// distinguish media types
@@ -67,15 +116,39 @@
[ hash[0], hash.substr(0, 2) ].join('/') + '/' +
title.key;
- // XXX: parse options
- var contentPos = token.dataAttribs.contentPos;
- var optionSource = token.source.substr( contentPos[0], contentPos[1] -
contentPos[0] );
- console.log( 'optionSource: ' + optionSource );
+
+ // XXX: extract options
+ var options = [],
+ caption = null;
+ for( var i = 0, l = content.length; i<l; i++ ) {
+ var oContent = content[i],
+ oText = manager.env.tokensToString( oContent, true );
+ if ( oText.constructor === String ) {
+ var oText = oText.trim();
+ if ( this._simpleImageOptions[ oText ] ) {
+ options.push( new KV( this._simpleImageOptions[
oText ],
+ oText ) );
+ continue;
+ }
+ } else {
+ var bits = oText[0].split( '=', 2 );
+ if ( bits.length > 1 && this._prefixImageOptions[
bits[0].strip ] ) {
+ console.log('handle prefix ' + bits );
+ } else {
+ caption = oContent;
+ }
+ }
+ }
+
+
+ //var contentPos = token.dataAttribs.contentPos;
+ //var optionSource = token.source.substr( contentPos[0], contentPos[1]
- contentPos[0] );
+ //console.log( 'optionSource: ' + optionSource );
// XXX: The trouble with re-parsing is the need to re-expand templates.
- // Figure out often non-image links contain image-like parameters!
- var options = this.imageParser.processImageOptions( optionSource );
+ // Figure out how often non-image links contain image-like parameters!
+ //var options = this.imageParser.processImageOptions( optionSource );
//console.log( JSON.stringify( options, null, 2 ) );
- // XXX: check if the file exists, generate thumbnail
+ // XXX: check if the file exists, generate thumbnail, get size
// XXX: render according to mode (inline, thumb, framed etc)
var img = new SelfclosingTagTk( 'img',
[
Modified:
trunk/extensions/VisualEditor/modules/parser/ext.core.ParserFunctions.js
===================================================================
--- trunk/extensions/VisualEditor/modules/parser/ext.core.ParserFunctions.js
2012-03-05 11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/modules/parser/ext.core.ParserFunctions.js
2012-03-05 12:00:38 UTC (rev 113020)
@@ -366,7 +366,11 @@
};
ParserFunctions.prototype['pf_localurl'] = function ( target, argList, argDict
) {
- return ( this.manager.env.wgScriptPath + 'index' +
+ return (
+ '/' +
+ // FIXME! Figure out correct prefix to use
+ //this.manager.env.wgScriptPath +
+ 'index' +
this.manager.env.wgScriptExtension + '?title=' +
this.manager.env.normalizeTitle( target ) + '&'
+
argList.map(
Modified: trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js
===================================================================
--- trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js
2012-03-05 11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js
2012-03-05 12:00:38 UTC (rev 113020)
@@ -12,7 +12,13 @@
if ( false && this.nskey ) {
return this.env.wgScriptPath + this.nskey + ':' + this.key;
} else {
- return this.env.wgScriptPath + [this.ns.getDefaultName(),
this.key].join(':');
+ var l = this.env.wgScriptPath,
+ ns = this.ns.getDefaultName();
+
+ if ( ns ) {
+ l += ns + ':';
+ }
+ return l + this.key;
}
};
Modified:
trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js
===================================================================
---
trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js
2012-03-05 11:53:33 UTC (rev 113019)
+++
trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js
2012-03-05 12:00:38 UTC (rev 113020)
@@ -117,7 +117,7 @@
return new Title( text, 0, '', this );
}
} else {
- return new Title( text, 0, this );
+ return new Title( text, 0, '', this );
}
};
@@ -174,7 +174,7 @@
return name;
};
-MWParserEnvironment.prototype.tokensToString = function ( tokens ) {
+MWParserEnvironment.prototype.tokensToString = function ( tokens, strict ) {
var out = [];
//console.warn( 'MWParserEnvironment.tokensToString, tokens: ' +
JSON.stringify( tokens ) );
// XXX: quick hack, track down non-array sources later!
@@ -195,6 +195,9 @@
} else if ( token.type === 'COMMENT' || token.type ===
'NEWLINE' ) {
// strip comments and newlines
} else {
+ if ( strict ) {
+ return [out.join(''), null];
+ }
var tstring = JSON.stringify( token );
this.dp ( 'MWParserEnvironment.tokensToString, non-text
token: ' +
tstring + JSON.stringify( tokens, null,
2 ) );
Modified:
trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
===================================================================
--- trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
2012-03-05 11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
2012-03-05 12:00:38 UTC (rev 113020)
@@ -132,6 +132,7 @@
},
'|': function ( input, pos, syntaxFlags ) {
return syntaxFlags.template ||
+ syntaxFlags.linkdesc ||
( syntaxFlags.table &&
( input[pos + 1].match(/[|}]/) !== null ||
syntaxFlags.tableCellArg
Modified: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
===================================================================
--- trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
2012-03-05 11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
2012-03-05 12:00:38 UTC (rev 113020)
@@ -770,14 +770,14 @@
}
/ {
- return { pos: posStack.pop('lcontent' , pos), content: null };
+ return { pos: posStack.pop('lcontent' , pos), content: [] };
}
)
"]]"
// XXX In real MediaWiki, this is a language-dependent positive character
// class. Can we work out a static negative class instead?
// XXX: Exclude uppercase chars from non-latin languages too!
- trail:( ![A-Z \t(),.:-] tc:text_char { return tc } )* {
+ tail:( ![A-Z \t(),.:-] tc:text_char { return tc } )* {
var obj = new SelfclosingTagTk( 'wikilink' ),
textTokens = [];
obj.attribs.push( new KV('href', target) );
@@ -788,22 +788,12 @@
// XXX: Point to object with path, revision and input information
obj.source = input;
+ //console.warn('lcontent: ' + JSON.stringify( lcontent, null, 2 ) );
// Deal with content. XXX: Properly support pipe-trick etc
- if (lcontent.content && lcontent.content.length) {
- textTokens = lcontent.content;
- if (trail) {
- textTokens.push( trail.join('') );
- }
- } else {
- if (trail) {
- textTokens = $.extend(true, [], target).concat( [ trail.join('')
] );
- } else {
- // copy list
- textTokens = $.extend(true, [], target);
- }
- }
+ lcontent.tail = tail && tail.join('') || '';
- obj.attribs.push( new KV( 'content', flatten( textTokens ) ) );
+ obj.attribs.push( new KV( 'content', lcontent.content ) );
+ obj.attribs.push( new KV( 'tail', lcontent.tail ) );
//console.warn( "XXX:" + pp([obj].concat(textTokens, [new EndTagTk( 'a'
)])) );
return [obj];
}
@@ -826,6 +816,24 @@
}
/ & { return clearFlag('linkdesc'); }
+link_option
+ = & { setFlag('pipe'); return setFlag('linkdesc'); }
+ h:inline
+ // 'equal' syntaxFlag is set for links in template parameters. Consume the
+ // '=' here.
+ hs:( '=' inline)?
+ {
+ //console.warn('link_text' + pp(h) + pp(hs));
+ clearFlag('pipe');
+ clearFlag('linkdesc');
+ if( hs !== '' ) {
+ return h.concat(hs);
+ } else {
+ return h;
+ }
+ }
+ / & { clearFlag('pipe'); return clearFlag('linkdesc'); }
+
link_end = "]]"
/* Generic quote production for italic and bold, further processed in a token
@@ -1674,7 +1682,7 @@
= r:( t:[^%<~[{\n\r\t|!\]} &=]+ { return t.join(''); }
/ urlencoded_char
/ directive
- / !inline_breaks !"]]" text_char )+ {
+ / !inline_breaks !"|" !"]]" text_char )+ {
return flatten_stringlist ( r );
}
Modified: trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js
===================================================================
--- trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js
2012-03-05 11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js
2012-03-05 12:00:38 UTC (rev 113020)
@@ -13,7 +13,7 @@
testWhiteList["Unclosed and unmatched quotes"] = "<p
data-mw-sourcePos=\"0:66\"><i><b>Bold italic text </b>with bold deactivated<b>
in between.</b></i></p><p><i><b>Bold italic text </b></i><b>with italic
deactivated<i> in between.</i></b></p><p><b>Bold text..</b></p><p>..spanning
two paragraphs (should not work).<b></b></p><p><b>Bold tag left
open</b></p><p><i>Italic tag left open</i></p><p>Normal text.<!-- Unmatching
number of opening, closing tags: -->\n</p><p><b>This year'</b>s election
<i>should</i> beat <b>last year'</b>s.</p><p><i>Tom<b>s car is bigger than
</b></i><b>Susan</b>s.</p>";
// The expected result for this test is really broken html.
-testWhiteList["Link containing double-single-quotes '' in text embedded in
italics (bug 4598 sanity check)"] = "<p data-mw-sourcePos=\"0:45\"><i>Some <a
data-mw-type=\"internal\" href=\"Link\">pretty </a></i><a
data-mw-type=\"internal\" href=\"Link\">italics<i> and
stuff</i></a><i>!</i></p>";
+testWhiteList["Link containing double-single-quotes '' in text embedded in
italics (bug 4598 sanity check)"] = "<p data-mw-sourcePos=\"0:45\"><i>Some <a
data-mw-type=\"internal\" href=\"/wiki/Link\">pretty </a></i><a
data-mw-type=\"internal\" href=\"/wiki/Link\">italics<i> and
stuff</i></a><i>!</i></p>";
testWhiteList["External link containing double-single-quotes in text embedded
in italics (bug 4598 sanity check)"] = "<p><i>Some <a
href=\"http://example.com/\">pretty </a></i><a
href=\"http://example.com/\">italics<i> and stuff</i></a><i>!</i></p>";
@@ -47,9 +47,9 @@
/* Missing token transform functionality */
// We don't implement percent encoding for URIs yet.
-testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a
data-mw-type=\"internal\" href=\"Lista d''e paise d''o munno\">Lista d''e paise
d''o munno</a></p>";
+testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a
data-mw-type=\"internal\" href=\"/wiki/Lista d''e paise d''o munno\">Lista d''e
paise d''o munno</a></p>";
-testWhiteList["Link containing \"<#\" and \">#\" as a hex sequences"] = "<p><a
data-mw-type=\"internal\" href=\"<%23\"><%23</a><a
data-mw-type=\"internal\" href=\">%23\">>%23</a></p>";
+testWhiteList["Link containing \"<#\" and \">#\" as a hex sequences"] = "<p><a
data-mw-type=\"internal\" href=\"/wiki/<%23\"><%23</a><a
data-mw-type=\"internal\" href=\"/wiki/>%23\">>%23</a></p>";
// Sanitizer
@@ -57,17 +57,23 @@
testWhiteList["Table security: embedded pipes
(http://lists.wikimedia.org/mailman/htdig/wikitech-l/2006-April/022293.html)"]
= "<table><tbody><tr><td> |<a href=\"ftp://|x||\">[1]</a>\"
onmouseover=\"alert(document.cookie)\">test</td></tr></tbody></table>";
// Sanitizer, but UTF8 in link might actually be ok in HTML5
-testWhiteList["External link containing double-single-quotes with no space
separating the url from text in italics"] = "<p><a
href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\"><i>La muerte
de Casagemas</i> (1901) en el sitio de </a><a data-mw-type=\"internal\"
href=\"Museo Picasso (París)\">Museo Picasso</a>.</p>";
+testWhiteList["External link containing double-single-quotes with no space
separating the url from text in italics"] = "<p><a
href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\"
data-mw-type=\"external\" data-mw-rt=\"{"sourcePos":[0,146]}\"><i>La
muerte de Casagemas</i> (1901) en el sitio de </a><a
href=\"/wiki/Museo_Picasso_(París)\" data-mw-type=\"internal\">Museo
Picasso</a>.</p>";
// plain percent sign is also valid in HTML5
testWhiteList["Bug 4781, 5267: %28, %29 in URL"] = "<p><a
href=\"http://www.example.com/?title=Ben-Hur_(1959_film)\"
data-mw-sourcePos=\"0:53\">http://www.example.com/?title=Ben-Hur_(1959_film)</a></p>";
-testWhiteList["External links: wiki links within external link (Bug 3695)"] =
"<p><a href=\"http://example.com\" data-mw-type=\"external\"
data-mw-sourcePos=\"0:54\"></a><a data-mw-type=\"internal\"
href=\"wikilink\">wikilink</a> embedded in ext link</p>";
+testWhiteList["External links: wiki links within external link (Bug 3695)"] =
"<p><a href=\"http://example.com\" data-mw-type=\"external\"
data-mw-sourcePos=\"0:54\"></a><a data-mw-type=\"internal\"
href=\"/wiki/Wikilink\">wikilink</a> embedded in ext link</p>";
testWhiteList["Bug 4781, 5267: %25 in URL"] = "<p><a
href=\"http://www.example.com/?title=100%_Bran\"
data-mw-sourcePos=\"0:41\">http://www.example.com/?title=100%_Bran</a></p>";
testWhiteList["<pre> with forbidden attribute values (bug 3202)"] = "<pre
width=\"8\" style=\"\">Narrow screen goodies</pre>";
+testWhiteList["Link containing % (not as a hex sequence)"] = "<p><a
href=\"/wiki/7%_Solution\" data-mw-type=\"internal\">7% Solution</a></p>";
+
+testWhiteList["Link containing % as a single hex sequence interpreted to
char"] = "<p><a href=\"/wiki/7%_Solution\" data-mw-type=\"internal\">7%
Solution</a></p>";
+
+testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a
href=\"/wiki/Lista_d''e_paise_d''o_munno\" data-mw-type=\"internal\">Lista d''e
paise d''o munno</a></p>";
+
if (typeof module == "object") {
module.exports.testWhiteList = testWhiteList;
}
Modified: trunk/extensions/VisualEditor/tests/parser/parserTests.js
===================================================================
--- trunk/extensions/VisualEditor/tests/parser/parserTests.js 2012-03-05
11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/tests/parser/parserTests.js 2012-03-05
12:00:38 UTC (rev 113020)
@@ -202,7 +202,7 @@
fetchTemplates: false,
debug: this.argv.debug,
trace: this.argv.trace,
- wgScriptPath: '/'
+ wgScriptPath: '/wiki/'
});
}
@@ -308,7 +308,7 @@
return this.htmlparser.document.getElementsByTagName('body')[0]
.innerHTML
// a few things we ignore for now..
- .replace(/\/wiki\/Main_Page/g, 'Main Page')
+ //.replace(/\/wiki\/Main_Page/g, 'Main Page')
// do not expect a toc for now
.replace(/<table[^>]+?id="toc"[^>]*>.+?<\/table>/mg, '')
// do not expect section editing for now
@@ -316,7 +316,7 @@
// general class and titles, typically on links
.replace(/(title|class|rel)="[^"]+"/g, '')
// strip red link markup, we do not check if a page
exists yet
-
.replace(/\/index.php\?title=([^']+)&action=edit&redlink=1/g, '$1')
+
.replace(/\/index.php\?title=([^']+?)&action=edit&redlink=1/g,
'/wiki/$1')
// the expected html has some extra space in tags,
strip it
.replace(/<a +href/g, '<a href')
.replace(/" +>/g, '">');
_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs