On Thu, 10 Jun 2004 16:33:13 +0300, Alexander Valyalkin <[EMAIL PROTECTED]> wrote:

Today I wrote a new version of strip_tags().
Yes, it is not ideal, but it is much better than the current version.

Below is my complete version of strip_tags() with test cases. You can add or change
any test cases and compare the speed and results of the current strip_tags() with mine.


Sorry, but I stripped out the majority of the comments, because they were in Russian :)

====================cut====================
/***************************************************/
/* headers and build-time constants */
/***************************************************/
/*
 * The headers are included before the test table because the table is
 * terminated with NULL, which must be declared before it is used.
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Export marker used on php_strip_all_tags(); expands to nothing in this standalone build. */
#define PHPAPI

/* Maximum number of HEREDOC identifier characters remembered by the parser. */
#define PHP_MAX_HEREDOC_LEN 32
/* Maximum number of tag-name characters remembered by the parser. */
#define PHP_MAX_TAG_LEN 32

/***************************************************/
/* test strings */
/***************************************************/
/* NULL-terminated list of inputs fed to php_strip_all_tags() by main(). */
char *s[] = {
    "", /* empty string */
    "a", /* one character */
    "<", /* single '<' char */
    "ab", /* two chars */
    "test<b", /* incomplete tag */
    "test<b title='asdf ", /* incomplete single quotes */
    "test<b title=\"add", /* incomplete double quotes */
    "test<!-- sdf ", /* incomplete comment */
    "test<? echo 'hello' ", /* incomplete php-tag */
    "test<% echo 'hello' ", /* incomplete asp-tag */
    "test<% $a = 'ss ", /* incomplete php-string in single quotes */
    "test<?php $a = \"12\\\"3", /* incomplete php-string in double quotes */
    "test<? // comment", /* incomplete single-line comment */
    "test<? # comment ", /* incomplete single-line comment */
    "test<? /* comment\n** 23", /* incomplete multi-line comment */
    "test<? $a = `ls -l", /* incomplete quotes */
    "test<? $a=<<<FOO\nssdf", /* incomplete HEREDOC */
    "test<script>if (1<b) alert('<b>ee</b>');", /* incomplete <script> tag */
    "test<StYle>div {font-weight:bold; }", /* incomplete <style> tag */
    "a< b", /* not a tag */
    "t<b>es</b>t", /* simple test */
    "te<b title='1 > 2' />st", /* '>' inside a single-quoted attribute, self-closing tag */
    "<b title=\"1 > 2\">test", /* '>' inside a double-quoted attribute */
    "t<b title='1 < 2'/>est", /* '<' inside a single-quoted attribute */
    "tes<b title = qwe'rt>t", /* stray single quote inside an unquoted attribute value */
    "t<!-- <<< comment <<< -->est", /* '<' characters inside an html comment */
    "<!-- >>> <b>comment</b> <<< -->test", /* '>' characters and a tag inside an html comment */
    "t<? echo '?>' ?>est", /* php close marker inside a single-quoted php-string */
    "<?='\"a\\'b' ?>test", /* short echo tag, escaped quote inside a single-quoted php-string */
    "te<% $a = \"?>'%>\"; // comment1\n // comment2 %>st", /* close markers inside a double-quoted string, then single-line comments */
    "t<?php \n # here is comment ?>est", /* php-tag closed from within a '#' comment */
    "te<?=\"dd\\\"d'?>d%>d\" ?>st", /* escaped double quote and close markers inside a double-quoted string */
    "<?php $a = <<<END\n`t's\"q?>t\nEND;\n ?>test", /* complete HEREDOC containing quotes and a close marker */
    "tes<? /* co'm\\m\\\"n`e\"t \n multi line \n */ test ?>t", /* complete multi-line comment containing quotes and backslashes */
    "<? print `sd\\`a'd\\\"d`; ?>test", /* backtick string with an escaped backtick and other quotes */
    "t<scrIpT type =\n 'text/javascript'>if (a <b ) alert('hello')</ScRipt>est", /* mixed-case <script> element with '<' in its body */
    "te<sTyLe>#a { color: red; }</style>st", /* mixed-case <style> element */
    "t<?xml version=\"1.0\"?>est", /* xml declaration */
    "<?xml version=\"1.'0\\\" encoding='UT\"F-8\\' ?>test", /* xml declaration with mixed and escaped quotes */
    NULL
};


char *allow = "<a><b><c>"; /* allowable tags */
/***************************************************/

/*
 * Copy a tag to the output if its name is present in the list of allowed tags.
 *
 * allow          - NUL-terminated list of allowed tags in the form "<a><b>";
 *                  it is searched with strstr().
 * allow_len      - length of allow; lists shorter than 3 bytes never match.
 * tag_name_begin - start of the collected tag name; a leading '/' marks an
 *                  end tag and is ignored when searching the allow list.
 * tag_name_ptr   - one past the last collected name character; a NUL is
 *                  written here, so the buffer must have room for it.
 * src_ptr        - position of the tag's closing '>' in the source buffer;
 *                  the two bytes before it are inspected.
 * dst_ptr        - in/out output cursor; advanced past whatever is written.
 *
 * When the tag is allowed, "<name", an optional ' ' (if the byte two before
 * src_ptr is a space), an optional '/' (if the byte before src_ptr is '/')
 * and '>' are written at *dst_ptr. Attributes are not copied. Nothing is
 * written when there are not more than tag_len bytes between *dst_ptr and
 * src_ptr.
 */
void php_tag_find(char *allow, size_t allow_len, char *tag_name_begin, char *tag_name_ptr, char *src_ptr, char **dst_ptr) {
    size_t tag_len = (size_t) (tag_name_ptr - tag_name_begin);
    size_t is_end_tag = (*tag_name_begin == '/' && tag_len > 1) ? 1 : 0;
    size_t name_len;
    char *name;
    char *allow_end;
    char *hit;

    *tag_name_ptr = '\0';
    if (allow_len < 3 || tag_len < 1 || (size_t) (src_ptr - *dst_ptr) <= tag_len) return;

    name = tag_name_begin + is_end_tag;
    name_len = tag_len - is_end_tag;
    allow_end = allow + allow_len;

    /*
     * Examine every occurrence of the name, not only the first one: in a
     * list such as "<ab><b>" the first "b" is part of "ab" and must be
     * skipped so that the real "<b>" entry can still be found.
     */
    for (hit = strstr(allow + 1, name); hit != NULL; hit = strstr(hit + 1, name)) {
        /* every later occurrence would end beyond allow_len as well */
        if (hit + name_len >= allow_end) return;
        /* the occurrence must be a whole entry: '<' before, '>' after */
        if (hit[name_len] == '>' && hit[-1] == '<') break;
    }
    if (hit == NULL) return;

    *(*dst_ptr)++ = '<';
    memcpy(*dst_ptr, tag_name_begin, tag_len);
    *dst_ptr += tag_len;
    if (*(src_ptr - 2) == ' ') *(*dst_ptr)++ = ' ';
    if (*(src_ptr - 1) == '/') *(*dst_ptr)++ = '/';
    *(*dst_ptr)++ = '>';
}


/*
 * Strip HTML, PHP and ASP style tags from a buffer in place.
 *
 * rbuf      - buffer to process; the result is written over the input and a
 *             NUL is stored after the last output byte, so rbuf needs room
 *             for len + 1 bytes. The HEREDOC search uses strstr() and thus
 *             expects the buffer to be NUL-terminated.
 * len       - number of input bytes in rbuf.
 * stateptr  - optional in/out parser state; the starting state is read from
 *             it and the final state is stored back. NULL starts in state 0.
 * allow     - list of tags to keep, passed on to php_tag_find().
 * allow_len - length of allow.
 *
 * Returns the number of bytes written to rbuf, not counting the NUL.
 *
 * The collected tag name and HEREDOC identifier live in static storage, so
 * they persist between calls and are shared by every caller.
 *
 * Parser states:
 *    0  plain text, bytes are copied to the output
 *    1  inside a tag, after the tag name
 *    2  inside a double-quoted attribute value
 *    3  inside a single-quoted attribute value
 *    4  inside a "<?" or "<%" code block
 *    5  inside a double-quoted string of a code block
 *    6  inside a single-quoted string of a code block
 *    7  inside a multi-line comment of a code block
 *    8  inside a "//" comment of a code block
 *    9  inside an html comment
 *   10  collecting the tag name
 *   11  inside the body of a <script> element
 *   12  inside the body of a <style> element
 *   13  a '<' that opens a tag was just seen
 *   14  the end of a tag or code block was just seen
 *   15  a "<<<" HEREDOC opener was just seen
 *   16  inside a HEREDOC whose terminator has not been found yet
 *   17  inside a backtick string of a code block
 *   18  inside a '#' comment of a code block
 *
 * NOTE(review): several cases look at the byte before src_ptr without a
 * bounds check; if a call starts in a non-zero state the very first byte
 * would make them read rbuf[-1] - confirm how callers resume parsing.
 */
PHPAPI size_t php_strip_all_tags(char *rbuf, int len, int *stateptr, char *allow, int allow_len)
{
    char *src_begin = rbuf,
         *src_ptr = rbuf,
         *src_end = rbuf + (size_t) len;
    char *dst_ptr = rbuf;
    int state;
    if (stateptr != NULL) state = *stateptr;
    else state = 0;
    /* tag name collected in state 10; at most PHP_MAX_TAG_LEN bytes are kept */
    static char tag_name_begin[PHP_MAX_TAG_LEN + 1],
                *tag_name_ptr = NULL,
                *tag_name_end = NULL;
    if (tag_name_ptr == NULL) tag_name_ptr = tag_name_begin;
    if (tag_name_end == NULL) tag_name_end = tag_name_begin + PHP_MAX_TAG_LEN;
    /* HEREDOC identifier collected in state 15 */
    static char heredoc_name_begin[PHP_MAX_HEREDOC_LEN + 1],
                *heredoc_name_ptr = NULL,
                *heredoc_name_end = NULL;
    if (heredoc_name_ptr == NULL) heredoc_name_ptr = heredoc_name_begin;
    if (heredoc_name_end == NULL) heredoc_name_end = heredoc_name_begin + PHP_MAX_HEREDOC_LEN;
    char ch;
    while (src_ptr < src_end) {
        ch = *src_ptr;
        /* first pass: state transitions triggered by the current byte */
        switch (ch) {
            case '#' :
                switch (state) {
                    case 4 : state = 18; break;
                }
                break;
            case '-' :
                switch (state) {
                    case 10 :
                        /* "<!--" opens an html comment */
                        if ((src_ptr - src_begin) > 2 && *(src_ptr - 1) == '-' && *(src_ptr - 2) == '!' &&
                            *(src_ptr - 3) == '<') state = 9;
                        break;
                }
                break;
            case '\r' :
            case '\n' :
                switch (state) {
                    /* a line break ends a single-line comment */
                    case 8 :
                    case 18 : state = 4; break;
                    case 10 : state = 1; break;
                }
                break;
            case ' ' :
            case '\t' :
            case '\v' :
            case '\f' :
                switch (state) {
                    /* white space ends the tag name */
                    case 10 : state = 1; break;
                }
                break;
            case '*' :
                switch (state) {
                    case 4 : if (*(src_ptr - 1) == '/') state = 7; break;
                }
                break;
            case '/' :
                switch (state) {
                    case 4 : if (*(src_ptr - 1) == '/') state = 8; break;
                    /* a '/' right after '<' belongs to the name of an end tag */
                    case 10 : if (*(src_ptr - 1) != '<') state = 1; break;
                }
                break;
            case '\\' :
                switch (state) {
                    case 5 :
                    case 6 :
                    case 17 :
                        /* skip the escaped byte inside a quoted string */
                        if (src_ptr < src_end) src_ptr++; break;
                }
                break;
            case '%' :
            case '?' :
                switch (state) {
                    case 10 :
                        /* "<?" or "<%" opens a code block unless an 'x' follows (as in "<?xml") */
                        if (*(src_ptr - 1) == '<' && src_ptr + 1 < src_end &&
                            tolower((unsigned char) *(src_ptr + 1)) != 'x') {
                            if (tag_name_ptr < tag_name_end) *tag_name_ptr++ = ch;
                            state = 4;
                        }
                        break;
                }
                break;
            case '<' :
                switch (state) {
                    /* a '<' that is last in the buffer or followed by white space is plain text */
                    case 0 : if (src_end - src_ptr > 1 && !isspace((unsigned char) *(src_ptr + 1))) state = 13; break;
                    case 4 :
                        if ((src_ptr + 2 < src_end) && *(src_ptr + 1) == '<' && *(src_ptr + 2) == '<') {
                            state = 15;
                            src_ptr += 2;
                        }
                        break;
                }
                break;
            case '>' :
                switch (state) {
                    case 4 :
                    case 8 :
                    /* the code block ends when '>' follows the byte that opened it ('?' or '%') */
                    case 18 : if (*(src_ptr - 1) == *tag_name_begin) state = 14; break;
                    case 1 :
                    case 10 :
                        if (tag_name_ptr - tag_name_begin == 6 &&
                            !memcmp(tag_name_begin, "script", 6)) state = 11;
                        else if (tag_name_ptr - tag_name_begin == 5 &&
                            !memcmp(tag_name_begin, "style", 5)) state = 12;
                        else {
                            php_tag_find(allow, (size_t) allow_len, tag_name_begin, tag_name_ptr, src_ptr, &dst_ptr);
                            state = 14;
                        }
                        break;
                }
                break;
            case '"' :
                switch (state) {
                    case 1 : if (*(src_ptr - 1) == '=' || isspace((unsigned char) *(src_ptr - 1))) state = 2; break;
                    case 4 : state = 5; break;
                    case 5 : state = 4; break;
                }
                break;
            case '\'' :
                switch (state) {
                    case 1 : if (*(src_ptr - 1) == '=' || isspace((unsigned char) *(src_ptr - 1))) state = 3; break;
                    case 4 : state = 6; break;
                    case 6 : state = 4; break;
                }
                break;
            case '`' :
                switch (state) {
                    case 4 : state = 17; break;
                    case 17 : state = 4; break;
                }
                break;
        }
        /* second pass: the work attached to the (possibly new) state */
        switch (state) {
            case 0 : *dst_ptr++ = ch; break;
            case 2 :
                /* jump to the closing double quote, or to the end if there is none */
                src_ptr++;
                src_ptr = memchr(src_ptr, '"', src_end - src_ptr);
                if (src_ptr == NULL) src_ptr = src_end;
                else state = 1;
                break;
            case 3 :
                /* jump to the closing single quote, or to the end if there is none */
                src_ptr++;
                src_ptr = memchr(src_ptr, '\'', src_end - src_ptr);
                if (src_ptr == NULL) src_ptr = src_end;
                else state = 1;
                break;
            case 7 :
                /* jump to the '/' that follows the next '*' */
                src_ptr++;
                while (src_ptr < src_end) {
                    src_ptr = memchr(src_ptr, '*', src_end - src_ptr);
                    if (src_ptr == NULL || src_end - src_ptr < 2) src_ptr = src_end;
                    else {
                        src_ptr++;
                        if (*src_ptr == '/') break;
                    }
                }
                if (src_ptr < src_end) state = 4;
                break;
            case 9 :
                /* jump to the "-->" that closes the html comment */
                src_ptr++;
                while (src_ptr < src_end) {
                    src_ptr = memchr(src_ptr, '-', src_end - src_ptr);
                    if (src_ptr == NULL || src_end - src_ptr < 3) src_ptr = src_end;
                    else {
                        src_ptr++;
                        if (*src_ptr == '-' && *(src_ptr + 1) == '>') break;
                    }
                }
                if (src_ptr < src_end) {
                    src_ptr++;
                    state = 0;
                }
                break;
            case 10 :
                /* the tag name is stored in lower case; bytes beyond the limit are dropped */
                if (tag_name_ptr < tag_name_end) *tag_name_ptr++ = (char) tolower((unsigned char) ch);
                break;
            case 11 :
                /* jump to the case-insensitive "</script" */
                src_ptr++;
                while (src_ptr < src_end) {
                    src_ptr = memchr(src_ptr, '<', src_end - src_ptr);
                    if (src_ptr == NULL || src_end - src_ptr < 8) src_ptr = src_end;
                    else {
                        src_ptr++;
                        if (src_ptr[0] == '/' &&
                            tolower((unsigned char) src_ptr[1]) == 's' &&
                            tolower((unsigned char) src_ptr[2]) == 'c' &&
                            tolower((unsigned char) src_ptr[3]) == 'r' &&
                            tolower((unsigned char) src_ptr[4]) == 'i' &&
                            tolower((unsigned char) src_ptr[5]) == 'p' &&
                            tolower((unsigned char) src_ptr[6]) == 't') break;
                    }
                }
                if (src_ptr < src_end) {
                    src_ptr += 6;
                    /* a full-length name can never equal "script" or "style" again */
                    tag_name_ptr = tag_name_end;
                    state = 1;
                }
                break;
            case 12 :
                /* jump to the case-insensitive "</style" */
                src_ptr++;
                while (src_ptr < src_end) {
                    src_ptr = memchr(src_ptr, '<', src_end - src_ptr);
                    if (src_ptr == NULL || src_end - src_ptr < 7) src_ptr = src_end;
                    else {
                        src_ptr++;
                        if (src_ptr[0] == '/' &&
                            tolower((unsigned char) src_ptr[1]) == 's' &&
                            tolower((unsigned char) src_ptr[2]) == 't' &&
                            tolower((unsigned char) src_ptr[3]) == 'y' &&
                            tolower((unsigned char) src_ptr[4]) == 'l' &&
                            tolower((unsigned char) src_ptr[5]) == 'e') break;
                    }
                }
                if (src_ptr < src_end) {
                    src_ptr += 5;
                    tag_name_ptr = tag_name_end;
                    state = 1;
                }
                break;
            case 13 :
                /* start collecting a new tag name with the next byte */
                tag_name_ptr = tag_name_begin;
                state = 10;
                break;
            case 14 : state = 0; break;
            case 15 :
                /* read the HEREDOC identifier, then jump behind its next occurrence */
                src_ptr++;
                heredoc_name_ptr = heredoc_name_begin;
                while (src_ptr < src_end && (*src_ptr == ' ' || *src_ptr == '\t')) src_ptr++;
                if (src_ptr < src_end) {
                    while (src_ptr < src_end && heredoc_name_ptr < heredoc_name_end &&
                        isalnum((unsigned char) *src_ptr)) *heredoc_name_ptr++ = *src_ptr++;
                    if (src_ptr < src_end && isalpha((unsigned char) *heredoc_name_begin)) {
                        *heredoc_name_ptr++ = '\0';
                        src_ptr = strstr(src_ptr, heredoc_name_begin);
                        if (src_ptr == NULL) {
                            src_ptr = src_end;
                            state = 16;
                        } else {
                            src_ptr += heredoc_name_ptr - heredoc_name_begin;
                            state = 4;
                        }
                    } else state = 4;
                }
                break;
            case 16 :
                /* keep looking for the HEREDOC terminator remembered in state 15 */
                src_ptr = strstr(src_ptr, heredoc_name_begin);
                if (src_ptr == NULL) src_ptr = src_end;
                else {
                    src_ptr += heredoc_name_ptr - heredoc_name_begin;
                    state = 4;
                }
                break;
        }
        src_ptr++;
    }
    *dst_ptr = '\0';
    if (stateptr != NULL) *stateptr = state;
    return (size_t) (dst_ptr - src_begin);
}


/***************************************************/
/*
 * Test driver: runs php_strip_all_tags() over every string in s[] with the
 * tag list in allow and prints the stripped result, the input and output
 * lengths and the final parser state for each one.
 *
 * Every string is copied into a heap buffer first because the stripping is
 * done in place. Returns 0 on success and EXIT_FAILURE when memory cannot
 * be allocated.
 */
int main(int argc,char *argv[])
{
    int i = 0;
    char *s1;
    char *grown;
    size_t capacity, src_len, dst_len, allow_len;
    int state;

    (void) argc;
    (void) argv;

    allow_len = strlen(allow);
    capacity = 1;
    s1 = malloc(capacity);
    if (s1 == NULL) {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }
    *s1 = '\0';
    while (s[i] != NULL) {
        printf("str_num=%d, ", i);
        state = 0; /* every string is parsed from the initial state */
        src_len = strlen(s[i]);
        if (src_len + 1 > capacity) {
            /* keep the old pointer until realloc is known to have succeeded */
            grown = realloc(s1, src_len + 1);
            if (grown == NULL) {
                fprintf(stderr, "out of memory\n");
                free(s1);
                return EXIT_FAILURE;
            }
            s1 = grown;
            capacity = src_len + 1;
        }
        strcpy(s1, s[i]);
        dst_len = php_strip_all_tags(s1, (int) src_len, &state, allow, (int) allow_len);
        /* size_t values need %zu; %d is a format/argument mismatch */
        printf("dst=[%s], src_len=%zu, dst_len=%zu, state=%d\n", s1, src_len, dst_len, state);
        i++;
    }
    free(s1);
    return 0;
}
====================cut====================


--
Using Opera's revolutionary e-mail client: http://www.opera.com/m2/

--
PHP Internals - PHP Runtime Development Mailing List
To unsubscribe, visit: http://www.php.net/unsub.php



Reply via email to