Attached (hopefully) is the re2c source for an HTML tokenizer. I added it to tokenizer.c - any thoughts on inclusion?
regards alan
enum { STATE_PLAIN = 0, STATE_TAG, STATE_NEXT_ARG, STATE_ARG, STATE_BEFORE_VAL, STATE_VAL }; /*!re2c any = [\000-\377]; N = (any\[<]); alpha = [a-zA-Z]; alphanumeric = [a-zA-Z0-9]; */ #define YYFILL(n) goto stop #define YYCTYPE unsigned char #define YYCURSOR xp #define YYLIMIT end #define YYMARKER q #define STATE state PHP_FUNCTION(token_html) { char *source = NULL; int argc = ZEND_NUM_ARGS(); int source_len; int state; char *end, *q; char *xp; char *start; zval *tag, *attribute; if (zend_parse_parameters(argc TSRMLS_CC, "s", &source, &source_len) == FAILURE) return; YYCURSOR = source; YYLIMIT = source + source_len; STATE = STATE_PLAIN; array_init(return_value); switch (STATE) { case STATE_PLAIN: goto state_plain; case STATE_TAG: goto state_tag; case STATE_NEXT_ARG: goto state_next_arg; case STATE_ARG: goto state_arg; case STATE_BEFORE_VAL: goto state_before_val; case STATE_VAL: goto state_val; } /* I need to split the stuff into: array ( "TAG", array("name"=>"value","name=>"value")) or string add_next_index_zval(return_value, tag);handle_tag(STD_ARGS); */ state_plain_begin: STATE = STATE_PLAIN; state_plain: start = YYCURSOR; /*!re2c "<" { STATE = STATE_TAG; goto state_tag; } N+ { add_next_index_stringl(return_value, start , xp - start , 1); goto state_plain; } */ state_tag: start = YYCURSOR; // start -> xp contains currunt pos, // needs to deal with comments !-- and ?xml or php etc. /*!re2c [/!]? alphanumeric+ { MAKE_STD_ZVAL(tag); array_init(tag); add_next_index_stringl(tag, start, xp - start, 1); goto state_next_arg_begin; } "!" 
"-" "-" { MAKE_STD_ZVAL(tag); array_init(tag); add_next_index_stringl(tag, start, xp - start, 1); goto state_comment_begin; } any { add_next_index_stringl(return_value, "<",1 , 1); --YYCURSOR; goto state_plain_begin; } */ state_comment_begin: start = YYCURSOR; state_comment_next: /*!re2c "-" "-" ">" { add_next_index_stringl(tag, start, xp - start -3, 1); add_next_index_zval(return_value, tag); goto state_plain_begin; } any { goto state_comment_next; } */ state_next_arg_begin: STATE = STATE_NEXT_ARG; // at first bit after < or just after a name or name='xxxx' state_next_arg: start = YYCURSOR; /*!re2c ">" { add_next_index_zval(return_value, tag); goto state_plain_begin; } [ \v\t\n]+ { goto state_next_arg; } alpha { --YYCURSOR; STATE = STATE_ARG; goto state_arg; } "/" { MAKE_STD_ZVAL(attribute); array_init(attribute); add_next_index_stringl(attribute, start, xp - start, 1);add_next_index_zval(tag, attribute); goto state_next_arg; } ["] (any\["])* ["] { MAKE_STD_ZVAL(attribute); array_init(attribute); add_next_index_stringl(attribute, start + 1, xp - start -2, 1); add_next_index_stringl(attribute, "\"", 1, 1); add_next_index_zval(tag, attribute); goto state_next_arg_begin; } ['] (any\['])* ['] { MAKE_STD_ZVAL(attribute); array_init(attribute); add_next_index_stringl(attribute, start + 1, xp - start -2, 1); add_next_index_stringl(attribute, "'", 1, 1); add_next_index_zval(tag, attribute); goto state_next_arg_begin; } any { add_next_index_zval(return_value, tag); goto state_plain_begin; } */ state_arg: start = YYCURSOR; /*!re2c alpha+ { MAKE_STD_ZVAL(attribute); array_init(attribute); add_next_index_stringl(attribute, start, xp - start, 1); STATE = STATE_BEFORE_VAL; goto state_before_val; } any { --YYCURSOR; STATE = STATE_ARG; goto state_next_arg; } */ state_before_val: start = YYCURSOR; /*!re2c [ ]* "=" [ ]* { STATE = STATE_VAL; goto state_val; } any { add_next_index_zval(tag, attribute); --YYCURSOR; goto state_next_arg_begin; } */ state_val: start = YYCURSOR; /*!re2c 
["] (any\["])* ["] { add_next_index_stringl(attribute, start + 1, xp - start -2, 1); add_next_index_stringl(attribute, "\"", 1, 1); add_next_index_zval(tag, attribute); goto state_next_arg_begin; } ['] (any\['])* ['] { add_next_index_stringl(attribute, start + 1, xp - start -2, 1); add_next_index_stringl(attribute, "'", 1, 1); add_next_index_zval(tag, attribute); goto state_next_arg_begin; } (any\[ \n>"'])+ { add_next_index_stringl(attribute, start, xp - start, 1); add_next_index_zval(tag, attribute); goto state_next_arg_begin; } any { add_next_index_zval(tag, attribute); --YYCURSOR; goto state_next_arg_begin; } */ stop: // should do a bit of checking - adding loose attribute or tags to return value.... }
-- PHP Development Mailing List <http://www.php.net/> To unsubscribe, visit: http://www.php.net/unsub.php