Attached, hopefully, is the re2c source for an HTML tokenizer - I added it
to tokenizer.c - any thoughts on inclusion?
regards
alan
/* Scanner states tracked in the STATE variable of token_html(). */
enum {
	STATE_PLAIN      = 0, /* outside of any tag, collecting text */
	STATE_TAG        = 1, /* a "<" has just been consumed */
	STATE_NEXT_ARG   = 2, /* inside a tag, looking for the next attribute or ">" */
	STATE_ARG        = 3, /* positioned on the first letter of an attribute name */
	STATE_BEFORE_VAL = 4, /* attribute name read, "=" not seen yet */
	STATE_VAL        = 5  /* "=" consumed, attribute value expected */
};
/*
 * Named re2c character classes used by the scanner rules in token_html():
 *   any          - every byte value, octal 000 through 377
 *   N            - any byte except '<'
 *   alpha        - ASCII letters, both cases
 *   alphanumeric - ASCII letters and decimal digits
 */
/*!re2c
any = [\000-\377];
N = (any\[<]);
alpha = [a-zA-Z];
alphanumeric = [a-zA-Z0-9];
*/
/*
 * Glue between the re2c-generated scanner code and the local variables of
 * token_html().
 */
/* The input is never refilled: running short of input jumps to the `stop` label. */
#define YYFILL(n) goto stop
/* Input is examined as unsigned bytes. */
#define YYCTYPE unsigned char
/* Scan position; token_html() initialises it to the start of the source string. */
#define YYCURSOR xp
/* One past the last input byte (source + source_len in token_html()). */
#define YYLIMIT end
/* Scratch pointer handed to re2c as its marker; token_html() never reads it itself. */
#define YYMARKER q
/* Current scanner state, one of the STATE_* constants. */
#define STATE state
/* {{{ proto array token_html(string source)
   Splits an HTML string into a flat list of tokens.

   Each element of the returned array is either
     - a string: a run of text found outside of any tag (a "<" that does not
       start a tag is returned as the one-character string "<"), or
     - an array describing a tag:
         [0]   the tag name as written, including a leading "/" or "!"
               (for comments: "!--", followed in [1] by the comment text)
         [1..] one array per attribute:
                 array(name)                attribute without a value
                 array(name, value)         unquoted value
                 array(name, value, quote)  quoted value, quote is " or '
                 array("/")                 a bare slash inside the tag
                 array(value, quote)        a quoted string without a name

   When the input ends in the middle of a token, the text, attribute and tag
   collected so far are still appended to the result. */
PHP_FUNCTION(token_html)
{
	char *source = NULL;
	int argc = ZEND_NUM_ARGS();
	int source_len;
	int state;
	char *end, *q;
	char *xp;
	char *start;
	/*
	 * tag / attribute are non-NULL only while this function still owns them,
	 * i.e. between MAKE_STD_ZVAL() and the add_next_index_zval() call that
	 * hands them to their parent array. The stop: handler relies on this.
	 */
	zval *tag = NULL;
	zval *attribute = NULL;

	if (zend_parse_parameters(argc TSRMLS_CC, "s", &source, &source_len) == FAILURE) {
		return;
	}

	YYCURSOR = source;
	YYLIMIT = source + source_len;
	start = source;
	STATE = STATE_PLAIN;
	array_init(return_value);

	/* Scanning always starts in plain-text mode; fall through. */

state_plain_begin:
	STATE = STATE_PLAIN;
state_plain:
	start = YYCURSOR;
/*!re2c
"<" { STATE = STATE_TAG; goto state_tag; }
N+ {
	add_next_index_stringl(return_value, start, xp - start, 1);
	goto state_plain;
}
*/

state_tag:
	/* start .. xp covers the text following the "<" */
	/* TODO: needs to deal with ?xml, php etc. */
	start = YYCURSOR;
/*!re2c
[/!]? alphanumeric+ {
	MAKE_STD_ZVAL(tag);
	array_init(tag);
	add_next_index_stringl(tag, start, xp - start, 1);
	goto state_next_arg_begin;
}
"!" "-" "-" {
	MAKE_STD_ZVAL(tag);
	array_init(tag);
	add_next_index_stringl(tag, start, xp - start, 1);
	goto state_comment_begin;
}
any {
	add_next_index_stringl(return_value, "<", 1, 1);
	--YYCURSOR;
	goto state_plain_begin;
}
*/

state_comment_begin:
	start = YYCURSOR;
state_comment_next:
	/* start is deliberately not reset here: it marks the first comment byte. */
/*!re2c
"-" "-" ">" {
	add_next_index_stringl(tag, start, xp - start - 3, 1);
	add_next_index_zval(return_value, tag);
	tag = NULL;
	goto state_plain_begin;
}
any { goto state_comment_next; }
*/

state_next_arg_begin:
	STATE = STATE_NEXT_ARG;
	/* just after the tag name, an attribute name or a name='xxxx' pair */
state_next_arg:
	/*
	 * NOTE(review): only space, \v, \t and \n are skipped between attributes;
	 * any other byte (including \r) falls to the final rule and ends the tag.
	 */
	start = YYCURSOR;
/*!re2c
">" {
	add_next_index_zval(return_value, tag);
	tag = NULL;
	goto state_plain_begin;
}
[ \v\t\n]+ { goto state_next_arg; }
alpha { --YYCURSOR; STATE = STATE_ARG; goto state_arg; }
"/" {
	MAKE_STD_ZVAL(attribute);
	array_init(attribute);
	add_next_index_stringl(attribute, start, xp - start, 1);
	add_next_index_zval(tag, attribute);
	attribute = NULL;
	goto state_next_arg;
}
["] (any\["])* ["] {
	MAKE_STD_ZVAL(attribute);
	array_init(attribute);
	add_next_index_stringl(attribute, start + 1, xp - start - 2, 1);
	add_next_index_stringl(attribute, "\"", 1, 1);
	add_next_index_zval(tag, attribute);
	attribute = NULL;
	goto state_next_arg_begin;
}
['] (any\['])* ['] {
	MAKE_STD_ZVAL(attribute);
	array_init(attribute);
	add_next_index_stringl(attribute, start + 1, xp - start - 2, 1);
	add_next_index_stringl(attribute, "'", 1, 1);
	add_next_index_zval(tag, attribute);
	attribute = NULL;
	goto state_next_arg_begin;
}
any {
	add_next_index_zval(return_value, tag);
	tag = NULL;
	goto state_plain_begin;
}
*/

state_arg:
	start = YYCURSOR;
/*!re2c
alpha+ {
	MAKE_STD_ZVAL(attribute);
	array_init(attribute);
	add_next_index_stringl(attribute, start, xp - start, 1);
	STATE = STATE_BEFORE_VAL;
	goto state_before_val;
}
any { --YYCURSOR; STATE = STATE_ARG; goto state_next_arg; }
*/

state_before_val:
	start = YYCURSOR;
/*!re2c
[ ]* "=" [ ]* { STATE = STATE_VAL; goto state_val; }
any {
	add_next_index_zval(tag, attribute);
	attribute = NULL;
	--YYCURSOR;
	goto state_next_arg_begin;
}
*/

state_val:
	start = YYCURSOR;
/*!re2c
["] (any\["])* ["] {
	add_next_index_stringl(attribute, start + 1, xp - start - 2, 1);
	add_next_index_stringl(attribute, "\"", 1, 1);
	add_next_index_zval(tag, attribute);
	attribute = NULL;
	goto state_next_arg_begin;
}
['] (any\['])* ['] {
	add_next_index_stringl(attribute, start + 1, xp - start - 2, 1);
	add_next_index_stringl(attribute, "'", 1, 1);
	add_next_index_zval(tag, attribute);
	attribute = NULL;
	goto state_next_arg_begin;
}
(any\[ \n>"'])+ {
	add_next_index_stringl(attribute, start, xp - start, 1);
	add_next_index_zval(tag, attribute);
	attribute = NULL;
	goto state_next_arg_begin;
}
any {
	add_next_index_zval(tag, attribute);
	attribute = NULL;
	--YYCURSOR;
	goto state_next_arg_begin;
}
*/

stop:
	/*
	 * Reached through YYFILL() when the scanner runs out of input. Nothing
	 * can be refilled, so hand back whatever has been collected instead of
	 * dropping (and leaking) it.
	 *
	 * NOTE(review): re2c may request more than one byte at a time, so a
	 * complete token sitting very close to the end of the input can end up
	 * here as well - confirm against the generated code. Input that ends
	 * after "<" but before a tag name was recognised is still discarded.
	 */
	if (STATE == STATE_PLAIN && start < YYLIMIT) {
		/* text that was still being collected when the input ended */
		add_next_index_stringl(return_value, start, YYLIMIT - start, 1);
	}
	if (attribute) {
		/* an attribute can only be pending while its tag is pending too */
		add_next_index_zval(tag, attribute);
		attribute = NULL;
	}
	if (tag) {
		add_next_index_zval(return_value, tag);
		tag = NULL;
	}
}
/* }}} */
--
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, visit: http://www.php.net/unsub.php