Stig S. Bakken wrote: >Good idea. I assume you want this for WidgetHTML.php? ;-) > > Yeah, hopefully it would remove the need for preg_matching in there..
Will get back to this in a few days - want to clear off some other stuff... regards alan > - Stig > >On Wed, 2002-06-12 at 14:26, Alan Knowles wrote: > > >>Attached hopefully is the re2c source for a html tokenizer - I added it >>to tokenizer.c - any thoughts on inclusion? >> >>regards >>alan >> >>---- >> >> >> > > > >>enum { >> STATE_PLAIN = 0, >> STATE_TAG, >> STATE_NEXT_ARG, >> STATE_ARG, >> STATE_BEFORE_VAL, >> STATE_VAL >>}; >> >>/*!re2c >>any = [\000-\377]; >>N = (any\[<]); >>alpha = [a-zA-Z]; >>alphanumeric = [a-zA-Z0-9]; >>*/ >> >> >> >>#define YYFILL(n) goto stop >>#define YYCTYPE unsigned char >>#define YYCURSOR xp >>#define YYLIMIT end >>#define YYMARKER q >>#define STATE state >> >>PHP_FUNCTION(token_html) >>{ >> char *source = NULL; >> int argc = ZEND_NUM_ARGS(); >> int source_len; >> int state; >> char *end, *q; >> char *xp; >> char *start; >> zval *tag, *attribute; >> >> if (zend_parse_parameters(argc TSRMLS_CC, "s", &source, &source_len) == >FAILURE) >> return; >> >> YYCURSOR = source; >> YYLIMIT = source + source_len; >> STATE = STATE_PLAIN; >> >> array_init(return_value); >> switch (STATE) { >> case STATE_PLAIN: goto state_plain; >> case STATE_TAG: goto state_tag; >> case STATE_NEXT_ARG: goto state_next_arg; >> case STATE_ARG: goto state_arg; >> case STATE_BEFORE_VAL: goto state_before_val; >> case STATE_VAL: goto state_val; >> } >> >> /* >> >> I need to split the stuff into: >> array ( "TAG", array("name"=>"value","name=>"value")) >> or >> string >> >> >> add_next_index_zval(return_value, tag);handle_tag(STD_ARGS); >> */ >> >> >> >>state_plain_begin: >> STATE = STATE_PLAIN; >> >>state_plain: >> start = YYCURSOR; >>/*!re2c >> "<" { STATE = STATE_TAG; goto state_tag; } >> N+ { add_next_index_stringl(return_value, start , xp - >start , 1); goto state_plain; } >>*/ >> >>state_tag: >> start = YYCURSOR; >> >>// start -> xp contains currunt pos, >>// needs to deal with comments !-- and ?xml or php etc. >>/*!re2c >> [/!]? 
alphanumeric+ { MAKE_STD_ZVAL(tag); array_init(tag); >add_next_index_stringl(tag, start, xp - start, 1); goto state_next_arg_begin; } >> "!" "-" "-" { MAKE_STD_ZVAL(tag); array_init(tag); >add_next_index_stringl(tag, start, xp - start, 1); goto state_comment_begin; } >> any { add_next_index_stringl(return_value, "<",1 , 1); --YYCURSOR; >goto state_plain_begin; } >>*/ >> >> >> >>state_comment_begin: >> start = YYCURSOR; >> >>state_comment_next: >> >>/*!re2c >> "-" "-" ">" { add_next_index_stringl(tag, start, xp - start -3, 1); >add_next_index_zval(return_value, tag); goto state_plain_begin; } >> any { goto state_comment_next; } >>*/ >> >>state_next_arg_begin: >> STATE = STATE_NEXT_ARG; >> >>// at first bit after < or just after a name or name='xxxx' >>state_next_arg: >> start = YYCURSOR; >>/*!re2c >> ">" { add_next_index_zval(return_value, tag); goto state_plain_begin; } >> [ \v\t\n]+ { goto state_next_arg; } >> alpha { --YYCURSOR; STATE = STATE_ARG; goto state_arg; } >> "/" { MAKE_STD_ZVAL(attribute); array_init(attribute); >add_next_index_stringl(attribute, start, xp - start, 1);add_next_index_zval(tag, >attribute); goto state_next_arg; } >> ["] (any\["])* ["] { MAKE_STD_ZVAL(attribute); array_init(attribute); >add_next_index_stringl(attribute, start + 1, xp - start -2, 1); >add_next_index_stringl(attribute, "\"", 1, 1); add_next_index_zval(tag, attribute); >goto state_next_arg_begin; } >> ['] (any\['])* ['] { MAKE_STD_ZVAL(attribute); array_init(attribute); >add_next_index_stringl(attribute, start + 1, xp - start -2, 1); >add_next_index_stringl(attribute, "'", 1, 1); add_next_index_zval(tag, attribute); >goto state_next_arg_begin; } >> any { add_next_index_zval(return_value, tag); goto state_plain_begin; } >>*/ >> >>state_arg: >> start = YYCURSOR; >>/*!re2c >> alpha+ { MAKE_STD_ZVAL(attribute); array_init(attribute); >add_next_index_stringl(attribute, start, xp - start, 1); STATE = STATE_BEFORE_VAL; >goto state_before_val; } >> any { --YYCURSOR; STATE = 
STATE_ARG; goto state_next_arg; } >>*/ >> >>state_before_val: >> start = YYCURSOR; >>/*!re2c >> [ ]* "=" [ ]* { STATE = STATE_VAL; goto state_val; } >> any { add_next_index_zval(tag, attribute); --YYCURSOR; goto >state_next_arg_begin; } >>*/ >> >> >>state_val: >> start = YYCURSOR; >>/*!re2c >> ["] (any\["])* ["] { add_next_index_stringl(attribute, start + 1, xp - start -2, >1); add_next_index_stringl(attribute, "\"", 1, 1); add_next_index_zval(tag, >attribute); goto state_next_arg_begin; } >> ['] (any\['])* ['] { add_next_index_stringl(attribute, start + 1, xp - start -2, >1); add_next_index_stringl(attribute, "'", 1, 1); add_next_index_zval(tag, >attribute); goto state_next_arg_begin; } >> (any\[ \n>"'])+ { add_next_index_stringl(attribute, start, xp - start, 1); >add_next_index_zval(tag, attribute); goto state_next_arg_begin; } >> any { add_next_index_zval(tag, attribute); --YYCURSOR; goto >state_next_arg_begin; } >>*/ >> >>stop: >> // should do a bit of checking - adding loose attribute or tags to return >value.... >> >> >>} >> >>---- >> >> >> > > > >>-- >>PHP Development Mailing List <http://www.php.net/> >>To unsubscribe, visit: http://www.php.net/unsub.php >> >> -- PHP Development Mailing List <http://www.php.net/> To unsubscribe, visit: http://www.php.net/unsub.php