https://www.mediawiki.org/wiki/Special:Code/MediaWiki/113020

Revision: 113020
Author:   gwicke
Date:     2012-03-05 12:00:38 +0000 (Mon, 05 Mar 2012)
Log Message:
-----------
Change wikilink tokenization strategy to split on pipes. This makes it
possible to support template / template argument expansion in image options,
and causes little trouble for wikilinks. Non-image wikilinks with multiple
text pipes are quite rare in the dumps, and concatenating description tokens
with a plain '|' is quite easy. 261 parser tests passing.

Modified Paths:
--------------
    trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js
    trunk/extensions/VisualEditor/modules/parser/ext.core.ParserFunctions.js
    trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js
    trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js
    trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
    trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
    trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js
    trunk/extensions/VisualEditor/tests/parser/parserTests.js

Modified: trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js
===================================================================
--- trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js        
2012-03-05 11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js        
2012-03-05 12:00:38 UTC (rev 113020)
@@ -24,12 +24,12 @@
 WikiLinkHandler.prototype.rank = 1.15; // after AttributeExpander
 
 WikiLinkHandler.prototype.onWikiLink = function ( token, manager, cb ) {
-       var env = this.manager.env;
+       var env = this.manager.env,
+               href = env.lookupKV( token.attribs, 'href' ).v,
+               tail = env.lookupKV( token.attribs, 'tail' ).v;
        var title = this.manager.env.makeTitleFromPrefixedText( 
-                               env.tokensToString(
-                                       env.lookupKV( token.attribs, 'href' ).v 
-                               )
-                       );
+                                       env.tokensToString( href )
+                               );
 
        if ( title.ns.isFile() ) {
                return this.renderFile( token, manager, cb, title );
@@ -39,16 +39,65 @@
        } else {
                // Check if page exists
                // 
-               var obj = new TagTk( 'a', [ this.manager.env.lookupKV( 
token.attribs, 'href' ) ] );
+               //console.warn( 'title: ' + JSON.stringify( title ) );
+               var obj = new TagTk( 'a', [ new KV( 'href', title.makeLink() ) 
] ),
+                       content = this.manager.env.lookupKV( token.attribs, 
'content' ).v;
+               //console.warn('content: ' + JSON.stringify( content, null, 2 ) 
);
+               // XXX: handle trail
+               if ( content.length ) {
+                       var out = []
+                       for ( var i = 0, l = content.length; i < l ; i++ ) {
+                               out = out.concat( content[i] );
+                               if ( i < l - 1 ) {
+                                       out.push( '|' );
+                               }
+                       }
+                       content = out;
+               } else {
+                       content = href;
+               }
+               if ( tail ) {
+                       content.push( tail );
+               }
+               
                obj.attribs.push( new KV('data-mw-type', 'internal') );
-               var out = [obj].concat( this.manager.env.lookupKV( 
token.attribs, 'content' ).v, 
-                                                               new EndTagTk( 
'a' ) );
+               var out = [obj].concat( content, new EndTagTk( 'a' ) );
                //console.warn( JSON.stringify( out, null, 2 ) );
                return { tokens: out };
        }
 };
 
+WikiLinkHandler.prototype._simpleImageOptions = {
+       // halign
+       'left': 'halign',
+       'right': 'halign',
+       'center': 'halign',
+       'none': 'halign',
+       // valign
+       'baseline': 'valign',
+       'sub': 'valign',
+       'super': 'valign',
+       'top': 'valign',
+       'text-top': 'valign',
+       'middle': 'valign',
+       'bottom': 'valign',
+       'text-bottom': 'valign',
+       // format
+       'border': 'format',
+       'frameless': 'format',
+       'frame': 'format',
+       'thumbnail': 'format',
+       'thumb': 'format'
+};
 
+WikiLinkHandler.prototype._prefixImageOptions = {
+       'link': 'link',
+       'alt': 'alt',
+       'page': 'page',
+       'thumbnail': 'thumb',
+       'thumb': 'thumb'
+};
+
 WikiLinkHandler.prototype.renderFile = function ( token, manager, cb, title ) {
        var env = manager.env;
        // distinguish media types
@@ -67,15 +116,39 @@
                        [ hash[0], hash.substr(0, 2) ].join('/') + '/' + 
title.key;
        
        
-       // XXX: parse options
-       var contentPos = token.dataAttribs.contentPos;
-       var optionSource = token.source.substr( contentPos[0], contentPos[1] - 
contentPos[0] );
-       console.log( 'optionSource: ' + optionSource );
+
+       // XXX: extract options
+       var options = [],
+               caption = null;
+       for( var i = 0, l = content.length; i<l; i++ ) {
+               var oContent = content[i],
+                       oText = manager.env.tokensToString( oContent, true );
+               if ( oText.constructor === String ) {
+                       var oText = oText.trim();
+                       if ( this._simpleImageOptions[ oText ] ) {
+                               options.push( new KV( this._simpleImageOptions[ 
oText ], 
+                                                       oText ) );
+                               continue;
+                       } 
+               } else {
+                       var bits = oText[0].split( '=', 2 );
+                       if ( bits.length > 1 && this._prefixImageOptions[ 
bits[0].strip ] ) {
+                               console.log('handle prefix ' + bits );
+                       } else {
+                               caption = oContent;
+                       }
+               }
+       }
+       
+
+       //var contentPos = token.dataAttribs.contentPos;
+       //var optionSource = token.source.substr( contentPos[0], contentPos[1] 
- contentPos[0] );
+       //console.log( 'optionSource: ' + optionSource );
        // XXX: The trouble with re-parsing is the need to re-expand templates.
-       // Figure out often non-image links contain image-like parameters!
-       var options = this.imageParser.processImageOptions( optionSource );
+       // Figure out how often non-image links contain image-like parameters!
+       //var options = this.imageParser.processImageOptions( optionSource );
        //console.log( JSON.stringify( options, null, 2 ) );
-       // XXX: check if the file exists, generate thumbnail
+       // XXX: check if the file exists, generate thumbnail, get size
        // XXX: render according to mode (inline, thumb, framed etc)
        var img = new SelfclosingTagTk( 'img', 
                        [ 

Modified: 
trunk/extensions/VisualEditor/modules/parser/ext.core.ParserFunctions.js
===================================================================
--- trunk/extensions/VisualEditor/modules/parser/ext.core.ParserFunctions.js    
2012-03-05 11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/modules/parser/ext.core.ParserFunctions.js    
2012-03-05 12:00:38 UTC (rev 113020)
@@ -366,7 +366,11 @@
 };
 
 ParserFunctions.prototype['pf_localurl'] = function ( target, argList, argDict 
) {
-       return ( this.manager.env.wgScriptPath + 'index' +
+       return ( 
+                       '/' +
+                       // FIXME! Figure out correct prefix to use
+                       //this.manager.env.wgScriptPath + 
+                       'index' +
                                this.manager.env.wgScriptExtension + '?title=' +
                                this.manager.env.normalizeTitle( target ) + '&' 
+
                                argList.map( 

Modified: trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js
===================================================================
--- trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js     
2012-03-05 11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js     
2012-03-05 12:00:38 UTC (rev 113020)
@@ -12,7 +12,13 @@
        if ( false && this.nskey ) {
                return this.env.wgScriptPath + this.nskey + ':' + this.key;
        } else {
-               return this.env.wgScriptPath + [this.ns.getDefaultName(), 
this.key].join(':');
+               var l = this.env.wgScriptPath,
+                       ns = this.ns.getDefaultName();
+
+               if ( ns ) {
+                       l += ns + ':';
+               }
+               return l + this.key;
        }
 };
 

Modified: 
trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js
===================================================================
--- 
trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js    
    2012-03-05 11:53:33 UTC (rev 113019)
+++ 
trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js    
    2012-03-05 12:00:38 UTC (rev 113020)
@@ -117,7 +117,7 @@
                        return new Title( text, 0, '', this );
                }
        } else {
-               return new Title( text, 0, this );
+               return new Title( text, 0, '', this );
        }
 };
 
@@ -174,7 +174,7 @@
        return name;
 };
 
-MWParserEnvironment.prototype.tokensToString = function ( tokens ) {
+MWParserEnvironment.prototype.tokensToString = function ( tokens, strict ) {
        var out = [];
        //console.warn( 'MWParserEnvironment.tokensToString, tokens: ' + 
JSON.stringify( tokens ) );
        // XXX: quick hack, track down non-array sources later!
@@ -195,6 +195,9 @@
                } else if ( token.type === 'COMMENT' || token.type === 
'NEWLINE' ) {
                        // strip comments and newlines
                } else {
+                       if ( strict ) {
+                               return [out.join(''), null];
+                       }
                        var tstring = JSON.stringify( token );
                        this.dp ( 'MWParserEnvironment.tokensToString, non-text 
token: ' + 
                                        tstring + JSON.stringify( tokens, null, 
2 ) );

Modified: 
trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js
===================================================================
--- trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js     
2012-03-05 11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/modules/parser/mediawiki.tokenizer.peg.js     
2012-03-05 12:00:38 UTC (rev 113020)
@@ -132,6 +132,7 @@
        },
        '|': function ( input, pos, syntaxFlags ) {
                return syntaxFlags.template ||
+                          syntaxFlags.linkdesc ||
                        ( syntaxFlags.table &&
                          ( input[pos + 1].match(/[|}]/) !== null ||
                                syntaxFlags.tableCellArg

Modified: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
===================================================================
--- trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt 
2012-03-05 11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt 
2012-03-05 12:00:38 UTC (rev 113020)
@@ -770,14 +770,14 @@
                }
             
              / { 
-                 return { pos: posStack.pop('lcontent' , pos), content: null };
+                 return { pos: posStack.pop('lcontent' , pos), content: [] };
                }
              )
     "]]" 
     // XXX In real MediaWiki, this is a language-dependent positive character
     // class. Can we work out a static negative class instead?
     // XXX: Exclude uppercase chars from non-latin languages too!
-    trail:( ![A-Z \t(),.:-] tc:text_char { return tc } )* {
+    tail:( ![A-Z \t(),.:-] tc:text_char { return tc } )* {
       var obj = new SelfclosingTagTk( 'wikilink' ),
           textTokens = [];
       obj.attribs.push( new KV('href', target) );
@@ -788,22 +788,12 @@
       // XXX: Point to object with path, revision and input information
       obj.source = input;
 
+      //console.warn('lcontent: ' + JSON.stringify( lcontent, null, 2 ) );
       // Deal with content. XXX: Properly support pipe-trick etc
-      if (lcontent.content && lcontent.content.length) {
-          textTokens = lcontent.content;
-          if (trail) {
-            textTokens.push( trail.join('') );
-          }
-      } else {
-          if (trail) {
-              textTokens = $.extend(true, [], target).concat( [ trail.join('') 
] );
-          } else {
-              // copy list
-              textTokens = $.extend(true, [], target);
-          }
-      }
+      lcontent.tail = tail && tail.join('') || '';
 
-      obj.attribs.push( new KV( 'content', flatten( textTokens ) ) );
+      obj.attribs.push( new KV( 'content', lcontent.content ) );
+      obj.attribs.push( new KV( 'tail', lcontent.tail ) );
       //console.warn( "XXX:" + pp([obj].concat(textTokens, [new EndTagTk( 'a' 
)])) );
       return [obj];
   }
@@ -826,6 +816,24 @@
     }
   / & { return clearFlag('linkdesc'); }
 
+link_option
+  = & { setFlag('pipe'); return setFlag('linkdesc'); }
+    h:inline 
+    // 'equal' syntaxFlag is set for links in template parameters. Consume the
+    // '=' here.
+    hs:( '=' inline)?
+    { 
+        //console.warn('link_text' + pp(h) + pp(hs));
+        clearFlag('pipe');
+        clearFlag('linkdesc');
+        if( hs !== '' ) {
+            return h.concat(hs);
+        } else {
+            return h;
+        }
+    }
+  / & { clearFlag('pipe'); return clearFlag('linkdesc'); }
+
 link_end = "]]"
 
 /* Generic quote production for italic and bold, further processed in a token
@@ -1674,7 +1682,7 @@
   = r:( t:[^%<~[{\n\r\t|!\]} &=]+ { return t.join(''); }
   / urlencoded_char
   / directive
-  / !inline_breaks !"]]" text_char )+ {
+  / !inline_breaks !"|" !"]]" text_char )+ {
       return flatten_stringlist ( r );
   }
 

Modified: trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js
===================================================================
--- trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js 
2012-03-05 11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js 
2012-03-05 12:00:38 UTC (rev 113020)
@@ -13,7 +13,7 @@
 testWhiteList["Unclosed and unmatched quotes"] = "<p 
data-mw-sourcePos=\"0:66\"><i><b>Bold italic text </b>with bold deactivated<b> 
in between.</b></i></p><p><i><b>Bold italic text </b></i><b>with italic 
deactivated<i> in between.</i></b></p><p><b>Bold text..</b></p><p>..spanning 
two paragraphs (should not work).<b></b></p><p><b>Bold tag left 
open</b></p><p><i>Italic tag left open</i></p><p>Normal text.<!-- Unmatching 
number of opening, closing tags: -->\n</p><p><b>This year'</b>s election 
<i>should</i> beat <b>last year'</b>s.</p><p><i>Tom<b>s car is bigger than 
</b></i><b>Susan</b>s.</p>";
 
 // The expected result for this test is really broken html.
-testWhiteList["Link containing double-single-quotes '' in text embedded in 
italics (bug 4598 sanity check)"] = "<p data-mw-sourcePos=\"0:45\"><i>Some <a 
data-mw-type=\"internal\" href=\"Link\">pretty </a></i><a 
data-mw-type=\"internal\" href=\"Link\">italics<i> and 
stuff</i></a><i>!</i></p>";
+testWhiteList["Link containing double-single-quotes '' in text embedded in 
italics (bug 4598 sanity check)"] = "<p data-mw-sourcePos=\"0:45\"><i>Some <a 
data-mw-type=\"internal\" href=\"/wiki/Link\">pretty </a></i><a 
data-mw-type=\"internal\" href=\"/wiki/Link\">italics<i> and 
stuff</i></a><i>!</i></p>";
 
 testWhiteList["External link containing double-single-quotes in text embedded 
in italics (bug 4598 sanity check)"] = "<p><i>Some <a 
href=\"http://example.com/\";>pretty </a></i><a 
href=\"http://example.com/\";>italics<i> and stuff</i></a><i>!</i></p>";
 
@@ -47,9 +47,9 @@
 /* Missing token transform functionality */
 
 // We don't implement percent encoding for URIs yet.
-testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a 
data-mw-type=\"internal\" href=\"Lista d''e paise d''o munno\">Lista d''e paise 
d''o munno</a></p>";
+testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a 
data-mw-type=\"internal\" href=\"/wiki/Lista d''e paise d''o munno\">Lista d''e 
paise d''o munno</a></p>";
 
-testWhiteList["Link containing \"<#\" and \">#\" as a hex sequences"] = "<p><a 
data-mw-type=\"internal\" href=\"&lt;%23\">&lt;%23</a><a 
data-mw-type=\"internal\" href=\"&gt;%23\">&gt;%23</a></p>";
+testWhiteList["Link containing \"<#\" and \">#\" as a hex sequences"] = "<p><a 
data-mw-type=\"internal\" href=\"/wiki/&lt;%23\">&lt;%23</a><a 
data-mw-type=\"internal\" href=\"/wiki/&gt;%23\">&gt;%23</a></p>";
 
 
 // Sanitizer
@@ -57,17 +57,23 @@
 testWhiteList["Table security: embedded pipes 
(http://lists.wikimedia.org/mailman/htdig/wikitech-l/2006-April/022293.html)"] 
= "<table><tbody><tr><td> |<a href=\"ftp://|x||\">[1]</a>\" 
onmouseover=\"alert(document.cookie)\"&gt;test</td></tr></tbody></table>";
 
 // Sanitizer, but UTF8 in link might actually be ok in HTML5
-testWhiteList["External link containing double-single-quotes with no space 
separating the url from text in italics"] = "<p><a 
href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\";><i>La muerte 
de Casagemas</i> (1901) en el sitio de </a><a data-mw-type=\"internal\" 
href=\"Museo Picasso (París)\">Museo Picasso</a>.</p>";
+testWhiteList["External link containing double-single-quotes with no space 
separating the url from text in italics"] = "<p><a 
href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\"; 
data-mw-type=\"external\" data-mw-rt=\"{&quot;sourcePos&quot;:[0,146]}\"><i>La 
muerte de Casagemas</i> (1901) en el sitio de </a><a 
href=\"/wiki/Museo_Picasso_(París)\" data-mw-type=\"internal\">Museo 
Picasso</a>.</p>";
 
 // plain percent sign is also valid in HTML5
 testWhiteList["Bug 4781, 5267: %28, %29 in URL"] = "<p><a 
href=\"http://www.example.com/?title=Ben-Hur_(1959_film)\" 
data-mw-sourcePos=\"0:53\">http://www.example.com/?title=Ben-Hur_(1959_film)</a></p>";
 
-testWhiteList["External links: wiki links within external link (Bug 3695)"] = 
"<p><a href=\"http://example.com\"; data-mw-type=\"external\" 
data-mw-sourcePos=\"0:54\"></a><a data-mw-type=\"internal\" 
href=\"wikilink\">wikilink</a> embedded in ext link</p>";
+testWhiteList["External links: wiki links within external link (Bug 3695)"] = 
"<p><a href=\"http://example.com\"; data-mw-type=\"external\" 
data-mw-sourcePos=\"0:54\"></a><a data-mw-type=\"internal\" 
href=\"/wiki/Wikilink\">wikilink</a> embedded in ext link</p>";
 
 testWhiteList["Bug 4781, 5267: %25 in URL"] = "<p><a 
href=\"http://www.example.com/?title=100%_Bran\"; 
data-mw-sourcePos=\"0:41\">http://www.example.com/?title=100%_Bran</a></p>";
 
 testWhiteList["<pre> with forbidden attribute values (bug 3202)"] = "<pre 
width=\"8\" style=\"\">Narrow screen goodies</pre>";
 
+testWhiteList["Link containing % (not as a hex sequence)"] = "<p><a 
href=\"/wiki/7%_Solution\" data-mw-type=\"internal\">7% Solution</a></p>";
+
+testWhiteList["Link containing % as a single hex sequence interpreted to 
char"] = "<p><a href=\"/wiki/7%_Solution\" data-mw-type=\"internal\">7% 
Solution</a></p>";
+
+testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a 
href=\"/wiki/Lista_d''e_paise_d''o_munno\" data-mw-type=\"internal\">Lista d''e 
paise d''o munno</a></p>";
+
 if (typeof module == "object") {
        module.exports.testWhiteList = testWhiteList;
 }

Modified: trunk/extensions/VisualEditor/tests/parser/parserTests.js
===================================================================
--- trunk/extensions/VisualEditor/tests/parser/parserTests.js   2012-03-05 
11:53:33 UTC (rev 113019)
+++ trunk/extensions/VisualEditor/tests/parser/parserTests.js   2012-03-05 
12:00:38 UTC (rev 113020)
@@ -202,7 +202,7 @@
                fetchTemplates: false,
                debug: this.argv.debug,
                trace: this.argv.trace,
-               wgScriptPath: '/'
+               wgScriptPath: '/wiki/'
        });
 }
 
@@ -308,7 +308,7 @@
                return this.htmlparser.document.getElementsByTagName('body')[0]
                        .innerHTML
                        // a few things we ignore for now..
-                       .replace(/\/wiki\/Main_Page/g, 'Main Page')
+                       //.replace(/\/wiki\/Main_Page/g, 'Main Page')
                        // do not expect a toc for now
                        .replace(/<table[^>]+?id="toc"[^>]*>.+?<\/table>/mg, '')
                        // do not expect section editing for now
@@ -316,7 +316,7 @@
                        // general class and titles, typically on links
                        .replace(/(title|class|rel)="[^"]+"/g, '')
                        // strip red link markup, we do not check if a page 
exists yet
-                       
.replace(/\/index.php\?title=([^']+)&amp;action=edit&amp;redlink=1/g, '$1')
+                       
.replace(/\/index.php\?title=([^']+?)&amp;action=edit&amp;redlink=1/g, 
'/wiki/$1')
                        // the expected html has some extra space in tags, 
strip it
                        .replace(/<a +href/g, '<a href')
                        .replace(/" +>/g, '">');


_______________________________________________
MediaWiki-CVS mailing list
mediawiki-cvs@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to