http://www.mediawiki.org/wiki/Special:Code/MediaWiki/97542

Revision: 97542
Author:   maxsem
Date:     2011-09-19 19:33:16 +0000 (Mon, 19 Sep 2011)
Log Message:
-----------
Rewrote the tokenizer so that it can actually be understood by mortal humans. 
All of the previous tokenizer's glitches are kept intact FOR NOW, except for:
* Slightly different handling of invalid phonemes/hieroglyphs. Garbage in, 
garbage out.
* Normalised handling of "!" and "." - they are now handled uniformly whether 
or not they're separated from other tokens, e.g. "a!b" === "a ! b".
* Removed the attempt to handle parenthetical grouping pending a proper fix as 
requested by bug 31000.

Modified Paths:
--------------
    trunk/extensions/wikihiero/tests.txt
    trunk/extensions/wikihiero/wikihiero.body.php

Modified: trunk/extensions/wikihiero/tests.txt
===================================================================
--- trunk/extensions/wikihiero/tests.txt        2011-09-19 19:20:03 UTC (rev 
97541)
+++ trunk/extensions/wikihiero/tests.txt        2011-09-19 19:33:16 UTC (rev 
97542)
@@ -12,7 +12,7 @@
 !! input
 <hiero><script>alert("FAIL")</script></hiero>
 !! result
-<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table 
class="mw-hiero-table"><tr> <td>&lt;script&gt;alert(FAIL)&lt;script&gt;</td> 
</tr></table> </td></tr></table>
+<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table 
class="mw-hiero-table"><tr> 
<td>&lt;script&gt;alert(&quot;FAIL&quot;)&lt;/script&gt;</td> </tr></table> 
</td></tr></table>
 
 !! end
 
@@ -44,6 +44,15 @@
 !! end
 
 !! test
+WikiHiero - EOL
+!! input
+<hiero>A1!B1 ! C1</hiero>
+!! result
+<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table 
class="mw-hiero-table"><tr> <td><img style='margin:1px;' height='38' 
src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1' 
/></td></tr></table><table class="mw-hiero-table"><tr> <td><img 
style='margin:1px;' height='38' src='/extensions/wikihiero/img/hiero_B1.png' 
title='B1' alt='B1' /></td></tr></table><table class="mw-hiero-table"><tr> 
<td><img style='margin:1px;' height='38' 
src='/extensions/wikihiero/img/hiero_C1.png' title='C1' alt='C1' 
/></td></tr></table> </td></tr></table>
+
+!! end
+
+!! test
 WikiHiero - complex text with EOL
 !! input
 <hiero>M23-X1:R4-X8-Q2:D4-W17-R14-G4-R8-O29:V30-U23 !
@@ -61,3 +70,14 @@
 <table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table 
class="mw-hiero-table"><tr> <td><img class="mw-mirrored" style='margin:1px;' 
height='38' src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1' 
/></td><td><img style='margin:1px;' height='38' 
src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1' 
/></td></tr></table> </td></tr></table>
 
 !! end
+
+!! test
+WikiHiero - void blocks
+!! input
+<hiero>A1..B1.C1</hiero>
+<hiero>A1 .. B1 . C1</hiero>
+!!result
+<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table 
class="mw-hiero-table"><tr> <td><img style='margin:1px;' height='38' 
src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1' 
/></td><td><table class="mw-hiero-table" style="width: 
44px;"><tr><td>&#160;</td></tr></table></td><td><img style='margin:1px;' 
height='38' src='/extensions/wikihiero/img/hiero_B1.png' title='B1' alt='B1' 
/></td><td><table class="mw-hiero-table" style="width: 
22px;"><tr><td>&#160;</td></tr></table></td><td><img style='margin:1px;' 
height='38' src='/extensions/wikihiero/img/hiero_C1.png' title='C1' alt='C1' 
/></td></tr></table> </td></tr></table>
+<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table 
class="mw-hiero-table"><tr> <td><img style='margin:1px;' height='38' 
src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1' 
/></td><td><table class="mw-hiero-table" style="width: 
44px;"><tr><td>&#160;</td></tr></table></td><td><img style='margin:1px;' 
height='38' src='/extensions/wikihiero/img/hiero_B1.png' title='B1' alt='B1' 
/></td><td><table class="mw-hiero-table" style="width: 
22px;"><tr><td>&#160;</td></tr></table></td><td><img style='margin:1px;' 
height='38' src='/extensions/wikihiero/img/hiero_C1.png' title='C1' alt='C1' 
/></td></tr></table> </td></tr></table>
+
+!! end

Modified: trunk/extensions/wikihiero/wikihiero.body.php
===================================================================
--- trunk/extensions/wikihiero/wikihiero.body.php       2011-09-19 19:20:03 UTC 
(rev 97541)
+++ trunk/extensions/wikihiero/wikihiero.body.php       2011-09-19 19:33:16 UTC 
(rev 97542)
@@ -430,16 +430,14 @@
  * Hieroglyphs tokenizer class
  */
 /*private*/ class HieroTokenizer {
-       const TYPE_NONE    = 0;
-       const TYPE_GLYPH   = 1;    // rendered items
-       const TYPE_CODE    = 2;    // single code as ':', '*', '!', '(' or ')'
-       const TYPE_SPECIAL = 3;    // advanced code (more than 1 caracter)
-       const TYPE_END     = 4;    // end of line '!'
+       private static $delimiters = false;
+       private static $tokenDelimiters;
+       private static $singleChars;
 
        private $text;
        private $blocks = false;
-       private $blocks_id = 0;
-       private $item_id = 0;
+       private $currentBlock;
+       private $token;
 
        /**
         * Constructor
@@ -448,8 +446,19 @@
         */
        public function __construct( $text ) {
                $this->text = $text;
+               self::initStatic();
        }
 
+       private static function initStatic() {
+               if ( self::$delimiters ) {
+                       return;
+               }
+
+               self::$delimiters = array_flip( array( ' ', '-', "\t", "\n" ) );
+               self::$tokenDelimiters = array_flip( array( '*', ':', '(', ')' 
) );
+               self::$singleChars = array_flip( array( '!' ) );
+       }
+
        /**
         * Split text into blocks, then split blocks into items
         * 
@@ -459,72 +468,93 @@
                if ( $this->blocks !== false ) {
                        return $this->blocks;
                }
-               $this->blocks = array( array( '' ) );
-               $parentheses = 0;
-               $type = self::TYPE_NONE;
+               $this->blocks = array();
+               $this->currentBlock = array();
+               $this->token = '';
+               
+               $text = preg_replace( '/<!--.*?-->/', '', $this->text ); // 
remove HTML comments
 
-               for ( $i = 0; $i < strlen( $this->text ); $i++ ) {
+               for ( $i = 0; $i < strlen( $text ); $i++ ) {
                        $char = $this->text[$i];
 
-                       if ( $char == '(' ) {
-                               $parentheses++;
-                       } elseif ( $char == ')' ) {
-                               $parentheses--;
+                       if ( isset( self::$delimiters[$char] ) ) {
+                               $this->newBlock();
+                       } elseif ( isset( self::$singleChars[$char] ) ) {
+                               $this->singleCharBlock( $char );
+                       } elseif ( $char == '.' ) {
+                               $this->dot();
+                       } elseif ( isset( self::$tokenDelimiters[$char] ) ) {
+                               $this->newToken( $char );
+                       } else {
+                               $this->char( $char );
                        }
+               }
 
-                       if ( $parentheses == 0 ) {
-                               if ( $char == '-' || $char == ' ' ) {
-                                       if ( $type != self::TYPE_NONE ) {
-                                               $this->addBlock( '' );
-                                               $type = self::TYPE_NONE;
-                                       }
-                               }
-                       } else {// don't split block if inside parentheses
-                               if ( $char == '-' ) {
-                                       $this->addItem( '-' );
-                                       $type = self::TYPE_CODE;
-                               }
-                       }
+               $this->newBlock(); // flush stuff being processed
 
-                       if ( $char == '!' ) {
-                               if ( $this->item_id > 0 ) {
-                                       $this->addBlock();
-                               }
-                               $this->blocks[$this->blocks_id][$this->item_id] 
= $char;
-                               $type = self::TYPE_END;
+               return $this->blocks;
+       }
 
-                       } elseif ( preg_match( '/[*:()]/', $char ) ) {
-                               if ( $type == self::TYPE_GLYPH || $type == 
self::TYPE_CODE ) {
-                                       $this->addItem( '' );
-                               }
-                               $this->blocks[$this->blocks_id][$this->item_id] 
= $char;
-                               $type = self::TYPE_CODE;
+       /**
+        * Handles a block delimiter
+        */
+       private function newBlock() {
+               $this->newToken();
+               if( $this->currentBlock ) {
+                       $this->blocks[] = $this->currentBlock;
+                       $this->currentBlock = array();
+               }
+       }
 
-                       } elseif ( ctype_alnum( $char ) || $char == '.' || 
$char == '<'
-                               || $char == '>' || $char == '\\' ) {
-                               if ( $type == self::TYPE_END ) {
-                                       $this->addBlock( '' );
-                               } elseif ( $type == self::TYPE_CODE ) {
-                                       $this->addItem( '' );
-                               }
-                               $this->blocks[$this->blocks_id][$this->item_id] 
.= $char;
-                               $type = self::TYPE_GLYPH;
-                       }
+       /**
+        * Flushes current token, optionally adds another one
+        *
+        * @param $token Mixed: token to add or false
+        */
+       private function newToken( $token = false ) {
+               if ( $this->token !== '' ) {
+                       $this->currentBlock[] = $this->token;
+                       $this->token = '';
                }
-               return $this->blocks;
+               if ( $token !== false ) {
+                       $this->currentBlock[] = $token;
+               }
        }
 
-       private function addBlock( $newItem = false ) {
-               $this->blocks_id++;
-               $this->blocks[$this->blocks_id] = array();
-               $this->item_id = 0;
-               if ( $newItem !== false ) {
-                       $this->blocks[$this->blocks_id][$this->item_id] = 
$newItem;
+       /**
+        * Adds a block consisting of one character
+        *
+        * @param $char string: block character
+        */
+       private function singleCharBlock( $char ) {
+               $this->newBlock();
+               $this->blocks[] = array( $char );
+       }
+
+       /**
+        * Handles void blocks represented by dots
+        */
+       private function dot() {
+               if ( $this->token == '.' ) {
+                       $this->token = '..';
+                       $this->newBlock();
+               } else {
+                       $this->newBlock();
+                       $this->token = '.';
                }
        }
 
-       private function addItem( $item ) {
-               $this->item_id++;
-               $this->blocks[$this->blocks_id][$this->item_id] = $item;
+       /**
+        * Adds a miscellaneous character to current token
+        *
+        * @param $char string: character to add
+        */
+       private function char( $char ) {
+               if ( $this->token == '.' ) {
+                       $this->newBlock();
+                       $this->token = $char;
+               } else {
+                       $this->token .= $char;
+               }
        }
-}
\ No newline at end of file
+}


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to