Stig S. Bakken wrote:
>Good idea. I assume you want this for WidgetHTML.php? ;-)
>
>
Yeah, hopefully it would remove the need for preg_matching in there..
Will get back to this in a few days - want to clear off some other stuff...
regards
alan
> - Stig
>
>On Wed, 2002-06-12 at 14:26, Alan Knowles wrote:
>
>
>>Attached hopefully is the re2c source for a html tokenizer - I added it
>>to tokenizer.c - any thoughts on inclusion?
>>
>>regards
>>alan
>>
>>----
>>
>>
>>
>
>
>
>>enum {
>> STATE_PLAIN = 0,
>> STATE_TAG,
>> STATE_NEXT_ARG,
>> STATE_ARG,
>> STATE_BEFORE_VAL,
>> STATE_VAL
>>};
>>
>>/*!re2c
>>any = [\000-\377];
>>N = (any\[<]);
>>alpha = [a-zA-Z];
>>alphanumeric = [a-zA-Z0-9];
>>*/
>>
>>
>>
>>#define YYFILL(n) goto stop
>>#define YYCTYPE unsigned char
>>#define YYCURSOR xp
>>#define YYLIMIT end
>>#define YYMARKER q
>>#define STATE state
>>
>>PHP_FUNCTION(token_html)
>>{
>> char *source = NULL;
>> int argc = ZEND_NUM_ARGS();
>> int source_len;
>> int state;
>> char *end, *q;
>> char *xp;
>> char *start;
>> zval *tag, *attribute;
>>
>> if (zend_parse_parameters(argc TSRMLS_CC, "s", &source, &source_len) ==
>FAILURE)
>> return;
>>
>> YYCURSOR = source;
>> YYLIMIT = source + source_len;
>> STATE = STATE_PLAIN;
>>
>> array_init(return_value);
>> switch (STATE) {
>> case STATE_PLAIN: goto state_plain;
>> case STATE_TAG: goto state_tag;
>> case STATE_NEXT_ARG: goto state_next_arg;
>> case STATE_ARG: goto state_arg;
>> case STATE_BEFORE_VAL: goto state_before_val;
>> case STATE_VAL: goto state_val;
>> }
>>
>> /*
>>
>> I need to split the stuff into:
>> array ( "TAG", array("name"=>"value","name=>"value"))
>> or
>> string
>>
>>
>> add_next_index_zval(return_value, tag);handle_tag(STD_ARGS);
>> */
>>
>>
>>
>>state_plain_begin:
>> STATE = STATE_PLAIN;
>>
>>state_plain:
>> start = YYCURSOR;
>>/*!re2c
>> "<" { STATE = STATE_TAG; goto state_tag; }
>> N+ { add_next_index_stringl(return_value, start , xp -
>start , 1); goto state_plain; }
>>*/
>>
>>state_tag:
>> start = YYCURSOR;
>>
>>// start -> xp contains currunt pos,
>>// needs to deal with comments !-- and ?xml or php etc.
>>/*!re2c
>> [/!]? alphanumeric+ { MAKE_STD_ZVAL(tag); array_init(tag);
>add_next_index_stringl(tag, start, xp - start, 1); goto state_next_arg_begin; }
>> "!" "-" "-" { MAKE_STD_ZVAL(tag); array_init(tag);
>add_next_index_stringl(tag, start, xp - start, 1); goto state_comment_begin; }
>> any { add_next_index_stringl(return_value, "<",1 , 1); --YYCURSOR;
>goto state_plain_begin; }
>>*/
>>
>>
>>
>>state_comment_begin:
>> start = YYCURSOR;
>>
>>state_comment_next:
>>
>>/*!re2c
>> "-" "-" ">" { add_next_index_stringl(tag, start, xp - start -3, 1);
>add_next_index_zval(return_value, tag); goto state_plain_begin; }
>> any { goto state_comment_next; }
>>*/
>>
>>state_next_arg_begin:
>> STATE = STATE_NEXT_ARG;
>>
>>// at first bit after < or just after a name or name='xxxx'
>>state_next_arg:
>> start = YYCURSOR;
>>/*!re2c
>> ">" { add_next_index_zval(return_value, tag); goto state_plain_begin; }
>> [ \v\t\n]+ { goto state_next_arg; }
>> alpha { --YYCURSOR; STATE = STATE_ARG; goto state_arg; }
>> "/" { MAKE_STD_ZVAL(attribute); array_init(attribute);
>add_next_index_stringl(attribute, start, xp - start, 1);add_next_index_zval(tag,
>attribute); goto state_next_arg; }
>> ["] (any\["])* ["] { MAKE_STD_ZVAL(attribute); array_init(attribute);
>add_next_index_stringl(attribute, start + 1, xp - start -2, 1);
>add_next_index_stringl(attribute, "\"", 1, 1); add_next_index_zval(tag, attribute);
>goto state_next_arg_begin; }
>> ['] (any\['])* ['] { MAKE_STD_ZVAL(attribute); array_init(attribute);
>add_next_index_stringl(attribute, start + 1, xp - start -2, 1);
>add_next_index_stringl(attribute, "'", 1, 1); add_next_index_zval(tag, attribute);
>goto state_next_arg_begin; }
>> any { add_next_index_zval(return_value, tag); goto state_plain_begin; }
>>*/
>>
>>state_arg:
>> start = YYCURSOR;
>>/*!re2c
>> alpha+ { MAKE_STD_ZVAL(attribute); array_init(attribute);
>add_next_index_stringl(attribute, start, xp - start, 1); STATE = STATE_BEFORE_VAL;
>goto state_before_val; }
>> any { --YYCURSOR; STATE = STATE_ARG; goto state_next_arg; }
>>*/
>>
>>state_before_val:
>> start = YYCURSOR;
>>/*!re2c
>> [ ]* "=" [ ]* { STATE = STATE_VAL; goto state_val; }
>> any { add_next_index_zval(tag, attribute); --YYCURSOR; goto
>state_next_arg_begin; }
>>*/
>>
>>
>>state_val:
>> start = YYCURSOR;
>>/*!re2c
>> ["] (any\["])* ["] { add_next_index_stringl(attribute, start + 1, xp - start -2,
>1); add_next_index_stringl(attribute, "\"", 1, 1); add_next_index_zval(tag,
>attribute); goto state_next_arg_begin; }
>> ['] (any\['])* ['] { add_next_index_stringl(attribute, start + 1, xp - start -2,
>1); add_next_index_stringl(attribute, "'", 1, 1); add_next_index_zval(tag,
>attribute); goto state_next_arg_begin; }
>> (any\[ \n>"'])+ { add_next_index_stringl(attribute, start, xp - start, 1);
>add_next_index_zval(tag, attribute); goto state_next_arg_begin; }
>> any { add_next_index_zval(tag, attribute); --YYCURSOR; goto
>state_next_arg_begin; }
>>*/
>>
>>stop:
>> // should do a bit of checking - adding loose attribute or tags to return
>value....
>>
>>
>>}
>>
>>----
>>
>>
>>
>
>
>
>>--
>>PHP Development Mailing List <http://www.php.net/>
>>To unsubscribe, visit: http://www.php.net/unsub.php
>>
>>
--
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, visit: http://www.php.net/unsub.php