http://www.mediawiki.org/wiki/Special:Code/MediaWiki/97542
Revision: 97542
Author: maxsem
Date: 2011-09-19 19:33:16 +0000 (Mon, 19 Sep 2011)
Log Message:
-----------
Rewrote tokenizer so that it can actually be understood by mortal humans. All of
the previous tokenizer's glitches are kept intact FOR NOW, except for:
* Slightly different handling of invalid phonemes/hieroglyphs. Garbage in,
garbage out.
* Normalised handling of "!" and "." - now they are handled uniformly regardless
of whether they're separated from other tokens or not, e.g. "a!b" === "a ! b".
* Removed the attempt to handle parenthetical grouping pending a proper fix as
requested by bug 31000.
Modified Paths:
--------------
trunk/extensions/wikihiero/tests.txt
trunk/extensions/wikihiero/wikihiero.body.php
Modified: trunk/extensions/wikihiero/tests.txt
===================================================================
--- trunk/extensions/wikihiero/tests.txt 2011-09-19 19:20:03 UTC (rev
97541)
+++ trunk/extensions/wikihiero/tests.txt 2011-09-19 19:33:16 UTC (rev
97542)
@@ -12,7 +12,7 @@
!! input
<hiero><script>alert("FAIL")</script></hiero>
!! result
-<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table
class="mw-hiero-table"><tr> <td><script>alert(FAIL)<script></td>
</tr></table> </td></tr></table>
+<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table
class="mw-hiero-table"><tr>
<td><script>alert("FAIL")</script></td> </tr></table>
</td></tr></table>
!! end
@@ -44,6 +44,15 @@
!! end
!! test
+WikiHiero - EOL
+!! input
+<hiero>A1!B1 ! C1</hiero>
+!! result
+<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table
class="mw-hiero-table"><tr> <td><img style='margin:1px;' height='38'
src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1'
/></td></tr></table><table class="mw-hiero-table"><tr> <td><img
style='margin:1px;' height='38' src='/extensions/wikihiero/img/hiero_B1.png'
title='B1' alt='B1' /></td></tr></table><table class="mw-hiero-table"><tr>
<td><img style='margin:1px;' height='38'
src='/extensions/wikihiero/img/hiero_C1.png' title='C1' alt='C1'
/></td></tr></table> </td></tr></table>
+
+!! end
+
+!! test
WikiHiero - complex text with EOL
!! input
<hiero>M23-X1:R4-X8-Q2:D4-W17-R14-G4-R8-O29:V30-U23 !
@@ -61,3 +70,14 @@
<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table
class="mw-hiero-table"><tr> <td><img class="mw-mirrored" style='margin:1px;'
height='38' src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1'
/></td><td><img style='margin:1px;' height='38'
src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1'
/></td></tr></table> </td></tr></table>
!! end
+
+!! test
+WikiHiero - void blocks
+!! input
+<hiero>A1..B1.C1</hiero>
+<hiero>A1 .. B1 . C1</hiero>
+!!result
+<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table
class="mw-hiero-table"><tr> <td><img style='margin:1px;' height='38'
src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1'
/></td><td><table class="mw-hiero-table" style="width:
44px;"><tr><td> </td></tr></table></td><td><img style='margin:1px;'
height='38' src='/extensions/wikihiero/img/hiero_B1.png' title='B1' alt='B1'
/></td><td><table class="mw-hiero-table" style="width:
22px;"><tr><td> </td></tr></table></td><td><img style='margin:1px;'
height='38' src='/extensions/wikihiero/img/hiero_C1.png' title='C1' alt='C1'
/></td></tr></table> </td></tr></table>
+<table class='mw-hiero-table mw-hiero-outer' dir='ltr'><tr><td> <table
class="mw-hiero-table"><tr> <td><img style='margin:1px;' height='38'
src='/extensions/wikihiero/img/hiero_A1.png' title='A1' alt='A1'
/></td><td><table class="mw-hiero-table" style="width:
44px;"><tr><td> </td></tr></table></td><td><img style='margin:1px;'
height='38' src='/extensions/wikihiero/img/hiero_B1.png' title='B1' alt='B1'
/></td><td><table class="mw-hiero-table" style="width:
22px;"><tr><td> </td></tr></table></td><td><img style='margin:1px;'
height='38' src='/extensions/wikihiero/img/hiero_C1.png' title='C1' alt='C1'
/></td></tr></table> </td></tr></table>
+
+!! end
Modified: trunk/extensions/wikihiero/wikihiero.body.php
===================================================================
--- trunk/extensions/wikihiero/wikihiero.body.php 2011-09-19 19:20:03 UTC
(rev 97541)
+++ trunk/extensions/wikihiero/wikihiero.body.php 2011-09-19 19:33:16 UTC
(rev 97542)
@@ -430,16 +430,14 @@
* Hieroglyphs tokenizer class
*/
/*private*/ class HieroTokenizer {
- const TYPE_NONE = 0;
- const TYPE_GLYPH = 1; // rendered items
- const TYPE_CODE = 2; // single code as ':', '*', '!', '(' or ')'
- const TYPE_SPECIAL = 3; // advanced code (more than 1 caracter)
- const TYPE_END = 4; // end of line '!'
+ private static $delimiters = false;
+ private static $tokenDelimiters;
+ private static $singleChars;
private $text;
private $blocks = false;
- private $blocks_id = 0;
- private $item_id = 0;
+ private $currentBlock;
+ private $token;
/**
* Constructor
@@ -448,8 +446,19 @@
*/
public function __construct( $text ) {
$this->text = $text;
+ self::initStatic();
}
+ private static function initStatic() {
+ if ( self::$delimiters ) {
+ return;
+ }
+
+ self::$delimiters = array_flip( array( ' ', '-', "\t", "\n" ) );
+ self::$tokenDelimiters = array_flip( array( '*', ':', '(', ')'
) );
+ self::$singleChars = array_flip( array( '!' ) );
+ }
+
/**
* Split text into blocks, then split blocks into items
*
@@ -459,72 +468,93 @@
if ( $this->blocks !== false ) {
return $this->blocks;
}
- $this->blocks = array( array( '' ) );
- $parentheses = 0;
- $type = self::TYPE_NONE;
+ $this->blocks = array();
+ $this->currentBlock = array();
+ $this->token = '';
+
+ $text = preg_replace( '/<!--.*?-->/', '', $this->text ); //
remove HTML comments
- for ( $i = 0; $i < strlen( $this->text ); $i++ ) {
+ for ( $i = 0; $i < strlen( $text ); $i++ ) {
$char = $this->text[$i];
- if ( $char == '(' ) {
- $parentheses++;
- } elseif ( $char == ')' ) {
- $parentheses--;
+ if ( isset( self::$delimiters[$char] ) ) {
+ $this->newBlock();
+ } elseif ( isset( self::$singleChars[$char] ) ) {
+ $this->singleCharBlock( $char );
+ } elseif ( $char == '.' ) {
+ $this->dot();
+ } elseif ( isset( self::$tokenDelimiters[$char] ) ) {
+ $this->newToken( $char );
+ } else {
+ $this->char( $char );
}
+ }
- if ( $parentheses == 0 ) {
- if ( $char == '-' || $char == ' ' ) {
- if ( $type != self::TYPE_NONE ) {
- $this->addBlock( '' );
- $type = self::TYPE_NONE;
- }
- }
- } else {// don't split block if inside parentheses
- if ( $char == '-' ) {
- $this->addItem( '-' );
- $type = self::TYPE_CODE;
- }
- }
+ $this->newBlock(); // flush stuff being processed
- if ( $char == '!' ) {
- if ( $this->item_id > 0 ) {
- $this->addBlock();
- }
- $this->blocks[$this->blocks_id][$this->item_id]
= $char;
- $type = self::TYPE_END;
+ return $this->blocks;
+ }
- } elseif ( preg_match( '/[*:()]/', $char ) ) {
- if ( $type == self::TYPE_GLYPH || $type ==
self::TYPE_CODE ) {
- $this->addItem( '' );
- }
- $this->blocks[$this->blocks_id][$this->item_id]
= $char;
- $type = self::TYPE_CODE;
+ /**
+ * Handles a block delimiter
+ */
+ private function newBlock() {
+ $this->newToken();
+ if( $this->currentBlock ) {
+ $this->blocks[] = $this->currentBlock;
+ $this->currentBlock = array();
+ }
+ }
- } elseif ( ctype_alnum( $char ) || $char == '.' ||
$char == '<'
- || $char == '>' || $char == '\\' ) {
- if ( $type == self::TYPE_END ) {
- $this->addBlock( '' );
- } elseif ( $type == self::TYPE_CODE ) {
- $this->addItem( '' );
- }
- $this->blocks[$this->blocks_id][$this->item_id]
.= $char;
- $type = self::TYPE_GLYPH;
- }
+ /**
+ * Flushes current token, optionally adds another one
+ *
+ * @param $token Mixed: token to add or false
+ */
+ private function newToken( $token = false ) {
+ if ( $this->token !== '' ) {
+ $this->currentBlock[] = $this->token;
+ $this->token = '';
}
- return $this->blocks;
+ if ( $token !== false ) {
+ $this->currentBlock[] = $token;
+ }
}
- private function addBlock( $newItem = false ) {
- $this->blocks_id++;
- $this->blocks[$this->blocks_id] = array();
- $this->item_id = 0;
- if ( $newItem !== false ) {
- $this->blocks[$this->blocks_id][$this->item_id] =
$newItem;
+ /**
+ * Adds a block consisting of one character
+ *
+ * @param $char string: block character
+ */
+ private function singleCharBlock( $char ) {
+ $this->newBlock();
+ $this->blocks[] = array( $char );
+ }
+
+ /**
+ * Handles void blocks represented by dots
+ */
+ private function dot() {
+ if ( $this->token == '.' ) {
+ $this->token = '..';
+ $this->newBlock();
+ } else {
+ $this->newBlock();
+ $this->token = '.';
}
}
- private function addItem( $item ) {
- $this->item_id++;
- $this->blocks[$this->blocks_id][$this->item_id] = $item;
+ /**
+ * Adds a miscellaneous character to current token
+ *
+ * @param $char string: character to add
+ */
+ private function char( $char ) {
+ if ( $this->token == '.' ) {
+ $this->newBlock();
+ $this->token = $char;
+ } else {
+ $this->token .= $char;
+ }
}
-}
\ No newline at end of file
+}
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs