Good idea. I assume you want this for WidgetHTML.php? ;-) - Stig
On Wed, 2002-06-12 at 14:26, Alan Knowles wrote: > Attached hopefully is the re2c source for a html tokenizer - I added it > to tokenizer.c - any thoughts on inclusion? > > regards > alan > > ---- > > > enum { > STATE_PLAIN = 0, > STATE_TAG, > STATE_NEXT_ARG, > STATE_ARG, > STATE_BEFORE_VAL, > STATE_VAL > }; > > /*!re2c > any = [\000-\377]; > N = (any\[<]); > alpha = [a-zA-Z]; > alphanumeric = [a-zA-Z0-9]; > */ > > > > #define YYFILL(n) goto stop > #define YYCTYPE unsigned char > #define YYCURSOR xp > #define YYLIMIT end > #define YYMARKER q > #define STATE state > > PHP_FUNCTION(token_html) > { > char *source = NULL; > int argc = ZEND_NUM_ARGS(); > int source_len; > int state; > char *end, *q; > char *xp; > char *start; > zval *tag, *attribute; > > if (zend_parse_parameters(argc TSRMLS_CC, "s", &source, &source_len) == >FAILURE) > return; > > YYCURSOR = source; > YYLIMIT = source + source_len; > STATE = STATE_PLAIN; > > array_init(return_value); > switch (STATE) { > case STATE_PLAIN: goto state_plain; > case STATE_TAG: goto state_tag; > case STATE_NEXT_ARG: goto state_next_arg; > case STATE_ARG: goto state_arg; > case STATE_BEFORE_VAL: goto state_before_val; > case STATE_VAL: goto state_val; > } > > /* > > I need to split the stuff into: > array ( "TAG", array("name"=>"value","name"=>"value")) > or > string > > > add_next_index_zval(return_value, tag);handle_tag(STD_ARGS); > */ > > > > state_plain_begin: > STATE = STATE_PLAIN; > > state_plain: > start = YYCURSOR; > /*!re2c > "<" { STATE = STATE_TAG; goto state_tag; } > N+ { add_next_index_stringl(return_value, start , xp - >start , 1); goto state_plain; } > */ > > state_tag: > start = YYCURSOR; > > // start -> xp contains current pos, > // needs to deal with comments !-- and ?xml or php etc. > /*!re2c > [/!]? alphanumeric+ { MAKE_STD_ZVAL(tag); array_init(tag); >add_next_index_stringl(tag, start, xp - start, 1); goto state_next_arg_begin; } > "!" 
"-" "-" { MAKE_STD_ZVAL(tag); array_init(tag); >add_next_index_stringl(tag, start, xp - start, 1); goto state_comment_begin; } > any { add_next_index_stringl(return_value, "<",1 , 1); --YYCURSOR; >goto state_plain_begin; } > */ > > > > state_comment_begin: > start = YYCURSOR; > > state_comment_next: > > /*!re2c > "-" "-" ">" { add_next_index_stringl(tag, start, xp - start -3, 1); >add_next_index_zval(return_value, tag); goto state_plain_begin; } > any { goto state_comment_next; } > */ > > state_next_arg_begin: > STATE = STATE_NEXT_ARG; > > // at first bit after < or just after a name or name='xxxx' > state_next_arg: > start = YYCURSOR; > /*!re2c > ">" { add_next_index_zval(return_value, tag); goto state_plain_begin; } > [ \v\t\n]+ { goto state_next_arg; } > alpha { --YYCURSOR; STATE = STATE_ARG; goto state_arg; } > "/" { MAKE_STD_ZVAL(attribute); array_init(attribute); >add_next_index_stringl(attribute, start, xp - start, 1);add_next_index_zval(tag, >attribute); goto state_next_arg; } > ["] (any\["])* ["] { MAKE_STD_ZVAL(attribute); array_init(attribute); >add_next_index_stringl(attribute, start + 1, xp - start -2, 1); >add_next_index_stringl(attribute, "\"", 1, 1); add_next_index_zval(tag, attribute); >goto state_next_arg_begin; } > ['] (any\['])* ['] { MAKE_STD_ZVAL(attribute); array_init(attribute); >add_next_index_stringl(attribute, start + 1, xp - start -2, 1); >add_next_index_stringl(attribute, "'", 1, 1); add_next_index_zval(tag, attribute); >goto state_next_arg_begin; } > any { add_next_index_zval(return_value, tag); goto state_plain_begin; } > */ > > state_arg: > start = YYCURSOR; > /*!re2c > alpha+ { MAKE_STD_ZVAL(attribute); array_init(attribute); >add_next_index_stringl(attribute, start, xp - start, 1); STATE = STATE_BEFORE_VAL; >goto state_before_val; } > any { --YYCURSOR; STATE = STATE_ARG; goto state_next_arg; } > */ > > state_before_val: > start = YYCURSOR; > /*!re2c > [ ]* "=" [ ]* { STATE = STATE_VAL; goto state_val; } > any { 
add_next_index_zval(tag, attribute); --YYCURSOR; goto >state_next_arg_begin; } > */ > > > state_val: > start = YYCURSOR; > /*!re2c > ["] (any\["])* ["] { add_next_index_stringl(attribute, start + 1, xp - start -2, >1); add_next_index_stringl(attribute, "\"", 1, 1); add_next_index_zval(tag, >attribute); goto state_next_arg_begin; } > ['] (any\['])* ['] { add_next_index_stringl(attribute, start + 1, xp - start -2, >1); add_next_index_stringl(attribute, "'", 1, 1); add_next_index_zval(tag, >attribute); goto state_next_arg_begin; } > (any\[ \n>"'])+ { add_next_index_stringl(attribute, start, xp - start, 1); >add_next_index_zval(tag, attribute); goto state_next_arg_begin; } > any { add_next_index_zval(tag, attribute); --YYCURSOR; goto >state_next_arg_begin; } > */ > > stop: > // should do a bit of checking - adding loose attribute or tags to return >value.... > > > } > > ---- > > -- > PHP Development Mailing List <http://www.php.net/> > To unsubscribe, visit: http://www.php.net/unsub.php -- Stig Sæther Bakken, Fast Search & Transfer ASA, Trondheim, Norway http://pear.php.net/wishlist.php/ssb -- PHP Development Mailing List <http://www.php.net/> To unsubscribe, visit: http://www.php.net/unsub.php