Good idea.  I assume you want this for WidgetHTML.php? ;-)

 - Stig

On Wed, 2002-06-12 at 14:26, Alan Knowles wrote:
> Attached, hopefully, is the re2c source for an HTML tokenizer - I added it 
> to tokenizer.c - any thoughts on inclusion?
> 
> regards
> alan
> 
> ----
> 

> 
> enum { /* scanner states; kept in `state`, accessed through the STATE macro */
>         STATE_PLAIN = 0,        /* text outside any tag */
>         STATE_TAG,              /* just after a "<" */
>         STATE_NEXT_ARG,         /* inside a tag, between attributes */
>         STATE_ARG,              /* reading an attribute name */
>         STATE_BEFORE_VAL,       /* after an attribute name, looking for "=" */
>         STATE_VAL               /* reading an attribute value */
> };
> /* Character classes used by the re2c rule blocks further down:
>  *   any          - any byte value (octal 000-377)
>  *   N            - any byte except "<"
>  *   alpha        - ASCII letters
>  *   alphanumeric - ASCII letters and digits
>  */
> /*!re2c
> any = [\000-\377];
> N = (any\[<]);
> alpha = [a-zA-Z];
> alphanumeric = [a-zA-Z0-9];
> */
> 
> 
> 
> #define YYFILL(n) goto stop      /* no refill: jump to the stop label instead */
> #define YYCTYPE unsigned char    /* character type used by the scanner */
> #define YYCURSOR xp              /* current scan position */
> #define YYLIMIT end              /* end of input (set to source + source_len) */
> #define YYMARKER q               /* marker pointer for the generated scanner */
> #define STATE state              /* variable holding the current scanner state */
> /* token_html(string source)
>  *
>  * Tokenizes `source` into return_value, which is initialised as an array.
>  * As implemented below:
>  *   - each run of bytes other than "<" is appended as a string;
>  *   - each tag is appended as a nested array whose first element is the tag
>  *     name (including a leading "/" or "!"), followed by one array per
>  *     attribute: (name), (name, value), (name, value, quote-char) or, for a
>  *     bare quoted string, (value, quote-char); a "/" inside the tag is
>  *     appended as ("/");
>  *   - a comment is appended as ("!--", comment-text);
>  *   - a "<" that starts neither a tag nor a comment is appended as "<".
>  *
>  * Returns without touching return_value if parameter parsing fails.
>  *
>  * Scanning ends at the `stop` label, which is reached through YYFILL.
>  * Nothing is done there yet.
>  * NOTE(review): a tag or attribute still being built at that point looks
>  * like it is neither added to return_value nor released - confirm.
>  * NOTE(review): presumably trailing plain text can be lost the same way if
>  * YYFILL fires before the N+ action runs - verify against the code re2c
>  * generates.
>  */
> PHP_FUNCTION(token_html)
> {
>       char *source = NULL;
>       int argc = ZEND_NUM_ARGS();
>       int source_len;
>       int state; /* current scanner state (STATE) */
>       char *end, *q; /* end: one past the last input byte (YYLIMIT); q: YYMARKER */
>       char *xp; /* scan cursor (YYCURSOR) */
>       char *start; /* first byte of the token currently being matched */
>       zval *tag, *attribute; /* arrays being built for the current tag / attribute */
>       
>       if (zend_parse_parameters(argc TSRMLS_CC, "s", &source, &source_len) == 
>FAILURE) 
>               return;
>       
>       YYCURSOR = source;
>       YYLIMIT = source + source_len;
>       STATE = STATE_PLAIN;
>       
>       array_init(return_value);
>       switch (STATE) { /* STATE was just set to STATE_PLAIN, so this enters state_plain */
>               case STATE_PLAIN:       goto state_plain;
>               case STATE_TAG:         goto state_tag;
>               case STATE_NEXT_ARG:    goto state_next_arg;
>               case STATE_ARG:         goto state_arg;
>               case STATE_BEFORE_VAL:  goto state_before_val;
>               case STATE_VAL:         goto state_val;
>       }
>       
>       /* 
>       
>       I need to split the stuff into:
>               array ( "TAG", array("name"=>"value","name"=>"value"))
>               or 
>               string
>       
>       (The rules below currently append attributes as positional arrays,
>       not as name=>value pairs.)
>       
>       add_next_index_zval(return_value, tag);handle_tag(STD_ARGS); 
>       */
> 
> 
> /* Plain text: a "<" enters the tag state, any other run becomes a string. */
> state_plain_begin:
>       STATE = STATE_PLAIN;
>       
> state_plain:
>       start = YYCURSOR;
> /*!re2c
>   "<"                         { STATE = STATE_TAG; goto state_tag; }
>   N+                          { add_next_index_stringl(return_value, start , xp - 
>start  , 1); goto state_plain; }
> */
> /* Just after "<": tag name, comment opener, or a literal "<". */
> state_tag:    
>       start = YYCURSOR;
>       
> // start -> xp contains current pos,  
> // needs to deal with comments !-- and ?xml or php etc.
> /*!re2c
>   [/!]? alphanumeric+ { MAKE_STD_ZVAL(tag); array_init(tag); 
>add_next_index_stringl(tag, start, xp - start, 1); goto state_next_arg_begin; }
>   "!" "-" "-"          { MAKE_STD_ZVAL(tag); array_init(tag); 
>add_next_index_stringl(tag, start, xp - start, 1); goto state_comment_begin; }
>   any                {  add_next_index_stringl(return_value, "<",1 , 1); --YYCURSOR; 
>goto state_plain_begin; }
> */
> 
> 
> /* Inside a comment: `start` marks the comment body; skip bytes until "-->". */
> state_comment_begin:
>       start = YYCURSOR;
>         
> state_comment_next:        
>         
> /*!re2c
>     "-" "-" ">"           { add_next_index_stringl(tag, start, xp - start -3, 1); 
>add_next_index_zval(return_value, tag); goto state_plain_begin; }
>     any                 { goto state_comment_next; }
> */
> /* Inside a tag, between attributes. */
> state_next_arg_begin:
>       STATE = STATE_NEXT_ARG;
>               
> // at first bit after < or just after a name or name='xxxx'   
> state_next_arg:
>       start = YYCURSOR;
> /*!re2c
>   ">"         { add_next_index_zval(return_value, tag); goto state_plain_begin; }
>   [ \v\t\n]+  { goto state_next_arg; }
>   alpha               { --YYCURSOR; STATE = STATE_ARG; goto state_arg; }
>   "/"           { MAKE_STD_ZVAL(attribute); array_init(attribute); 
>add_next_index_stringl(attribute, start, xp - start, 1);add_next_index_zval(tag, 
>attribute);  goto state_next_arg; }
>   ["] (any\["])* ["]  { MAKE_STD_ZVAL(attribute); array_init(attribute); 
>add_next_index_stringl(attribute, start + 1, xp - start -2, 1); 
>add_next_index_stringl(attribute, "\"", 1, 1); add_next_index_zval(tag, attribute); 
>goto state_next_arg_begin; }
>   ['] (any\['])* [']  { MAKE_STD_ZVAL(attribute); array_init(attribute); 
>add_next_index_stringl(attribute, start + 1, xp - start -2, 1); 
>add_next_index_stringl(attribute, "'", 1, 1); add_next_index_zval(tag, attribute); 
>goto state_next_arg_begin; }
>   any         { add_next_index_zval(return_value, tag); goto state_plain_begin; }
> */
> /* Attribute name: starts a new attribute array holding the name. */
> state_arg:
>       start = YYCURSOR;
> /*!re2c
>   alpha+      {  MAKE_STD_ZVAL(attribute); array_init(attribute); 
>add_next_index_stringl(attribute, start, xp - start, 1); STATE = STATE_BEFORE_VAL; 
>goto state_before_val; }
>   any         { --YYCURSOR; STATE = STATE_ARG; goto state_next_arg; }
> */
> /* After an attribute name: "=" leads to a value, anything else ends the attribute. */
> state_before_val:
>       start = YYCURSOR;
> /*!re2c
>   [ ]* "=" [ ]*               { STATE = STATE_VAL; goto state_val; }
>   any                 { add_next_index_zval(tag, attribute); --YYCURSOR; goto 
>state_next_arg_begin; }
> */
> 
> /* Attribute value: double-quoted, single-quoted or unquoted. */
> state_val:
>       start = YYCURSOR;
> /*!re2c
>   ["] (any\["])* ["]  { add_next_index_stringl(attribute, start + 1, xp - start -2, 
>1); add_next_index_stringl(attribute, "\"", 1, 1); add_next_index_zval(tag, 
>attribute); goto state_next_arg_begin; }
>   ['] (any\['])* [']  { add_next_index_stringl(attribute, start + 1, xp - start -2, 
>1); add_next_index_stringl(attribute, "'", 1, 1); add_next_index_zval(tag, 
>attribute); goto state_next_arg_begin; }
>   (any\[ \n>"'])+     { add_next_index_stringl(attribute, start, xp - start, 1); 
>add_next_index_zval(tag, attribute); goto state_next_arg_begin;  }
>   any                 { add_next_index_zval(tag, attribute); --YYCURSOR; goto 
>state_next_arg_begin; }
> */
> /* Target of YYFILL; no cleanup or flushing is performed here yet. */
> stop:
>       // should do a bit of checking - adding loose attribute or tags to return 
>value....
>       
> 
> }
> 
> ----
> 

> -- 
> PHP Development Mailing List <http://www.php.net/>
> To unsubscribe, visit: http://www.php.net/unsub.php
-- 
Stig Sæther Bakken, Fast Search & Transfer ASA, Trondheim, Norway
http://pear.php.net/wishlist.php/ssb


--
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to