Attached (hopefully) is the re2c source for an HTML tokenizer. I added it
to tokenizer.c - any thoughts on inclusion?

regards
alan


/* Scanner states of the HTML tokenizer; numbering starts at zero. */
enum {
        STATE_PLAIN      = 0,   /* outside of any tag, collecting text */
        STATE_TAG        = 1,   /* directly behind a "<" */
        STATE_NEXT_ARG   = 2,   /* behind the tag name or behind a complete attribute */
        STATE_ARG        = 3,   /* scanning an attribute name */
        STATE_BEFORE_VAL = 4,   /* behind an attribute name, looking for "=" */
        STATE_VAL        = 5    /* behind the "=", scanning the attribute value */
};

/*
 * Named re2c definitions shared by the scanner blocks of token_html():
 *   any          - every possible byte value (octal 000 through 377)
 *   N            - any byte except "<"
 *   alpha        - an ASCII letter
 *   alphanumeric - an ASCII letter or digit
 */
/*!re2c
any = [\000-\377];
N = (any\[<]);
alpha = [a-zA-Z];
alphanumeric = [a-zA-Z0-9];
*/



/*
 * Glue between the re2c generated scanner code and the local variables
 * of token_html().
 */
#define YYCTYPE   unsigned char   /* type of one input character */
#define YYCURSOR  xp              /* current read position */
#define YYMARKER  q               /* backtracking position */
#define YYLIMIT   end             /* one past the last input byte */
#define YYFILL(n) goto stop       /* there is no more input to load: leave the scanner */
#define STATE     state           /* current scanner state */

/*
 * array token_html(string source)
 *
 * Splits an HTML string into a flat list that is stored in return_value:
 *
 *   - text outside of tags is appended as a plain string;
 *   - every tag is appended as an array whose first element is the tag
 *     name (including a leading "/" or "!"), followed by one array per
 *     attribute:
 *         array(name)                 attribute without a value
 *         array(name, value)          unquoted value
 *         array(name, value, quote)   quoted value, quote is " or '
 *         array(value, quote)         quoted string without a name
 *         array("/")                  a "/" inside the tag
 *   - a comment is appended as array("!--", comment text).
 *
 * YYFILL() jumps to the stop label as soon as the generated scanner wants
 * more input than is left, i.e. before the action of the token that is
 * being scanned has run.  The code at stop therefore hands over whatever
 * is still pending (trailing text, an unfinished tag or comment), so that
 * nothing is lost or leaked.  An attribute name or value that is cut off
 * by the end of the input is discarded.
 *
 * NOTE(review): YYFILL(n) may be invoked with n > 1 at the start of a
 * scanner block, so a very short tag at the very end of the input (for
 * example a final "<b>") can end up in the result as plain text.  Scanning
 * a copy of the input that is padded with YYMAXFILL bytes would avoid
 * this - confirm against the re2c manual before relying on it.
 */
PHP_FUNCTION(token_html)
{
        char *source = NULL;
        int argc = ZEND_NUM_ARGS();
        int source_len;
        int state;
        int in_comment = 0;             /* set while the body of a comment is scanned */
        char *end, *q;
        char *xp;
        char *start;
        zval *tag = NULL;               /* tag under construction, NULL once handed over */
        zval *attribute = NULL;         /* attribute under construction, NULL once handed over */

        if (zend_parse_parameters(argc TSRMLS_CC, "s", &source, &source_len) == FAILURE) {
                return;
        }

        YYCURSOR = source;
        YYLIMIT = source + source_len;
        start = source;

        array_init(return_value);

state_plain_begin:
        STATE = STATE_PLAIN;

        /* outside of a tag: collect text up to the next "<" */
state_plain:
        start = YYCURSOR;
/*!re2c
  "<"   { STATE = STATE_TAG; goto state_tag; }
  N+    {
        add_next_index_stringl(return_value, start, YYCURSOR - start, 1);
        goto state_plain;
  }
*/

        /*
         * Directly behind a "<"; start .. YYCURSOR will hold the tag name.
         * TODO: needs to deal with ?xml, php etc.
         */
state_tag:
        start = YYCURSOR;
/*!re2c
  [/!]? alphanumeric+   {
        MAKE_STD_ZVAL(tag);
        array_init(tag);
        add_next_index_stringl(tag, start, YYCURSOR - start, 1);
        goto state_next_arg_begin;
  }
  "!" "-" "-"           {
        MAKE_STD_ZVAL(tag);
        array_init(tag);
        add_next_index_stringl(tag, start, YYCURSOR - start, 1);
        goto state_comment_begin;
  }
  any                   {
        add_next_index_stringl(return_value, "<", 1, 1);
        --YYCURSOR;
        goto state_plain_begin;
  }
*/

        /* inside a comment: start stays on the first byte of the comment text */
state_comment_begin:
        start = YYCURSOR;
        in_comment = 1;

state_comment_next:
/*!re2c
  "-" "-" ">"   {
        add_next_index_stringl(tag, start, YYCURSOR - start - 3, 1);
        add_next_index_zval(return_value, tag);
        tag = NULL;
        in_comment = 0;
        goto state_plain_begin;
  }
  any           { goto state_comment_next; }
*/

state_next_arg_begin:
        STATE = STATE_NEXT_ARG;

        /*
         * Behind the tag name or behind a complete attribute.  Carriage
         * return and form feed count as whitespace as well, so that input
         * with CRLF line ends does not terminate the tag early.
         */
state_next_arg:
        start = YYCURSOR;
/*!re2c
  ">"                   {
        add_next_index_zval(return_value, tag);
        tag = NULL;
        goto state_plain_begin;
  }
  [ \v\t\r\n\f]+        { goto state_next_arg; }
  alpha                 { --YYCURSOR; STATE = STATE_ARG; goto state_arg; }
  "/"                   {
        MAKE_STD_ZVAL(attribute);
        array_init(attribute);
        add_next_index_stringl(attribute, start, YYCURSOR - start, 1);
        add_next_index_zval(tag, attribute);
        attribute = NULL;
        goto state_next_arg;
  }
  ["] (any\["])* ["]    {
        MAKE_STD_ZVAL(attribute);
        array_init(attribute);
        add_next_index_stringl(attribute, start + 1, YYCURSOR - start - 2, 1);
        add_next_index_stringl(attribute, "\"", 1, 1);
        add_next_index_zval(tag, attribute);
        attribute = NULL;
        goto state_next_arg_begin;
  }
  ['] (any\['])* [']    {
        MAKE_STD_ZVAL(attribute);
        array_init(attribute);
        add_next_index_stringl(attribute, start + 1, YYCURSOR - start - 2, 1);
        add_next_index_stringl(attribute, "'", 1, 1);
        add_next_index_zval(tag, attribute);
        attribute = NULL;
        goto state_next_arg_begin;
  }
  any                   {
        add_next_index_zval(return_value, tag);
        tag = NULL;
        goto state_plain_begin;
  }
*/

        /* attribute name */
state_arg:
        start = YYCURSOR;
/*!re2c
  alpha+        {
        MAKE_STD_ZVAL(attribute);
        array_init(attribute);
        add_next_index_stringl(attribute, start, YYCURSOR - start, 1);
        STATE = STATE_BEFORE_VAL;
        goto state_before_val;
  }
  any           { --YYCURSOR; goto state_next_arg_begin; }
*/

        /* behind an attribute name: either "=" follows or the attribute has no value */
state_before_val:
        start = YYCURSOR;
/*!re2c
  [ ]* "=" [ ]*         { STATE = STATE_VAL; goto state_val; }
  any                   {
        add_next_index_zval(tag, attribute);
        attribute = NULL;
        --YYCURSOR;
        goto state_next_arg_begin;
  }
*/

        /*
         * Attribute value.  An unquoted value ends at a blank, tab, carriage
         * return, newline, ">" or quote character.
         */
state_val:
        start = YYCURSOR;
/*!re2c
  ["] (any\["])* ["]    {
        add_next_index_stringl(attribute, start + 1, YYCURSOR - start - 2, 1);
        add_next_index_stringl(attribute, "\"", 1, 1);
        add_next_index_zval(tag, attribute);
        attribute = NULL;
        goto state_next_arg_begin;
  }
  ['] (any\['])* [']    {
        add_next_index_stringl(attribute, start + 1, YYCURSOR - start - 2, 1);
        add_next_index_stringl(attribute, "'", 1, 1);
        add_next_index_zval(tag, attribute);
        attribute = NULL;
        goto state_next_arg_begin;
  }
  (any\[ \t\r\n>"'])+   {
        add_next_index_stringl(attribute, start, YYCURSOR - start, 1);
        add_next_index_zval(tag, attribute);
        attribute = NULL;
        goto state_next_arg_begin;
  }
  any                   {
        add_next_index_zval(tag, attribute);
        attribute = NULL;
        --YYCURSOR;
        goto state_next_arg_begin;
  }
*/

stop:
        /* end of input: hand over everything that is still pending */
        if (tag != NULL) {
                if (in_comment && YYLIMIT > start) {
                        /* unterminated comment: keep the text seen so far */
                        add_next_index_stringl(tag, start, YYLIMIT - start, 1);
                }
                if (attribute != NULL) {
                        add_next_index_zval(tag, attribute);
                        attribute = NULL;
                }
                add_next_index_zval(return_value, tag);
                tag = NULL;
        } else if (STATE == STATE_TAG) {
                /* a "<" without a complete tag name: return it as text, start - 1 is the "<" */
                add_next_index_stringl(return_value, start - 1, YYLIMIT - start + 1, 1);
        } else if (STATE == STATE_PLAIN && YYLIMIT > start) {
                /* trailing text whose N+ action never ran because YYFILL() fired first */
                add_next_index_stringl(return_value, start, YYLIMIT - start, 1);
        }
}

-- 
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to