Author: joes
Date: Mon Aug 22 09:55:18 2005
New Revision: 235750
URL: http://svn.apache.org/viewcvs?rev=235750&view=rev
Log:
Factor the charset detection code out of url_decode() and into a new
public function, apreq_charset_divine(). It is much cleaner to apply
the charset heuristics to the decoded string afterwards than to
interleave them with the decoding itself.
Modified:
httpd/apreq/trunk/CHANGES
httpd/apreq/trunk/include/apreq_util.h
httpd/apreq/trunk/include/apreq_version.h
httpd/apreq/trunk/library/t/util.c
httpd/apreq/trunk/library/util.c
Modified: httpd/apreq/trunk/CHANGES
URL:
http://svn.apache.org/viewcvs/httpd/apreq/trunk/CHANGES?rev=235750&r1=235749&r2=235750&view=diff
==============================================================================
--- httpd/apreq/trunk/CHANGES (original)
+++ httpd/apreq/trunk/CHANGES Mon Aug 22 09:55:18 2005
@@ -6,6 +6,9 @@
- C API [joes]
+ Add apreq_charset_divine().
+
+- C API [joes]
Improve the cp1252-charset heuristics for apreq_decode(v).
- C API [Ralph Mattes]
Modified: httpd/apreq/trunk/include/apreq_util.h
URL:
http://svn.apache.org/viewcvs/httpd/apreq/trunk/include/apreq_util.h?rev=235750&r1=235749&r2=235750&view=diff
==============================================================================
--- httpd/apreq/trunk/include/apreq_util.h (original)
+++ httpd/apreq/trunk/include/apreq_util.h Mon Aug 22 09:55:18 2005
@@ -129,6 +129,20 @@
const char *src, apr_size_t slen);
/**
+ * Heuristically determine the charset of a string.
+ *
+ * @param src String to scan.
+ * @param slen Length of string.
+ *
+ * @return APREQ_CHARSET_ASCII if the string contains only 7-bit chars;
+ * @return APREQ_CHARSET_UTF8 if the string is a valid utf8 byte sequence;
+ * @return APREQ_CHARSET_LATIN1 if the string has no control chars;
+ * @return APREQ_CHARSET_CP1252 if the string has control chars.
+ */
+APREQ_DECLARE(apreq_charset_t) apreq_charset_divine(const unsigned char *src,
+ apr_size_t slen);
+
+/**
* Url-decodes a string.
*
* @param dest Location of url-encoded result string. Caller must ensure dest is
Modified: httpd/apreq/trunk/include/apreq_version.h
URL:
http://svn.apache.org/viewcvs/httpd/apreq/trunk/include/apreq_version.h?rev=235750&r1=235749&r2=235750&view=diff
==============================================================================
--- httpd/apreq/trunk/include/apreq_version.h (original)
+++ httpd/apreq/trunk/include/apreq_version.h Mon Aug 22 09:55:18 2005
@@ -58,10 +58,10 @@
* Minor API changes that do not cause binary compatibility problems.
* Should be reset to 0 when upgrading APREQ_MAJOR_VERSION
*/
-#define APREQ_MINOR_VERSION 3
+#define APREQ_MINOR_VERSION 4
/** patch level */
-#define APREQ_PATCH_VERSION 1
+#define APREQ_PATCH_VERSION 0
/**
* This symbol is defined for internal, "development" copies of libapreq.
Modified: httpd/apreq/trunk/library/t/util.c
URL:
http://svn.apache.org/viewcvs/httpd/apreq/trunk/library/t/util.c?rev=235750&r1=235749&r2=235750&view=diff
==============================================================================
--- httpd/apreq/trunk/library/t/util.c (original)
+++ httpd/apreq/trunk/library/t/util.c Mon Aug 22 09:55:18 2005
@@ -107,7 +107,7 @@
apr_status_t status;
status = apreq_decodev(dest, &dest_len, iovec1, 3);
- AT_int_eq(status, APR_SUCCESS + APREQ_CHARSET_UTF8);
+ AT_int_eq(status, APR_SUCCESS);
AT_int_eq(dest_len, sizeof(expect1) - 1);
AT_mem_eq(dest, expect1, sizeof(expect1) - 1);
Modified: httpd/apreq/trunk/library/util.c
URL:
http://svn.apache.org/viewcvs/httpd/apreq/trunk/library/util.c?rev=235750&r1=235749&r2=235750&view=diff
==============================================================================
--- httpd/apreq/trunk/library/util.c (original)
+++ httpd/apreq/trunk/library/util.c Mon Aug 22 09:55:18 2005
@@ -217,74 +217,108 @@
/**
- * Valid utf8 bit patterns:
+ * Valid utf8 bit patterns: (true utf8 must satisfy a minimality condition)
*
* 0xxxxxxx
- * 110xxxxx 10xxxxxx
- * 1110xxxx 10xxxxxx 10xxxxxx
- * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
- * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 110xxxxx 10xxxxxx minimality mask: 0x1E
+ * 1110xxxx 10xxxxxx 10xxxxxx 0x0F || 0x20
+ * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 0x07 || 0x30
+ * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 0x03 || 0x38
+ * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 0x01 || 0x3C
+ *
+ * Charset divination heuristics:
+ * 1) presume ascii; if not, then
+ * 2) presume utf8; if not, then
+ * 3) presume latin1; unless there are control chars, in which case
+ * 4) punt to cp1252.
+ *
+ * Note: in downgrading from 2 to 3, we need to be careful
+ * about earlier control characters presumed to be valid utf8.
*/
-static APR_INLINE unsigned is_89AB(const char c)
-{
-
- switch(c) {
- case '8':
- case '9':
- case 'A':
- case 'B':
- case 'a':
- case 'b':
- return 1;
- }
- return 0;
-}
+APREQ_DECLARE(apreq_charset_t) apreq_charset_divine(const unsigned char *src,
+ apr_size_t slen)
-static APR_INLINE unsigned is_enc8(const char *word, unsigned char wlen)
{
+ apreq_charset_t rv = APREQ_CHARSET_ASCII;
+ register unsigned char trail = 0, saw_cntrl = 0, mask = 0;
+ const unsigned char *end = src + slen;
+
+ for (; src < end; ++src) {
+ if (trail) {
+ if ((*src & 0xC0) == 0x80 && (mask == 0 || (mask & *src))) {
+ mask = 0;
+ --trail;
- while (wlen-- > 0) {
- if (word[0] == '%' && is_89AB(word[1]) && apr_isxdigit(word[2]))
- word += 3;
- else
- return 0;
- }
- return 1;
-}
+ if ((*src & 0xE0) == 0x80) {
+ saw_cntrl = 1;
+ }
+ }
+ else {
+ trail = 0;
+ if (saw_cntrl)
+ return APREQ_CHARSET_CP1252;
+ rv = APREQ_CHARSET_LATIN1;
+ }
+ }
+ else if (*src < 0x80) {
+ /* do nothing */
+ }
+ else if (*src < 0xA0) {
+ return APREQ_CHARSET_CP1252;
+ }
+ else if (*src < 0xC0) {
+ if (saw_cntrl)
+ return APREQ_CHARSET_CP1252;
+ rv = APREQ_CHARSET_LATIN1;
+ }
+ else if (rv == APREQ_CHARSET_LATIN1) {
+ /* do nothing */
+ }
-static APR_INLINE unsigned is_enc8_fragment(const char *word,
- const char *end)
-{
- unsigned char flen = end - word;
- unsigned char wlen = flen / 3;
- if (!is_enc8(word, wlen))
- return 0;
-
- switch (flen % 3) {
- case 2:
- if (!is_89AB(*--end))
- return 0;
- case 1:
- if (*--end != '%')
- return 0;
+ /* utf8 cases */
+
+ else if (*src < 0xE0) {
+ if (*src & 0x1E) {
+ rv = APREQ_CHARSET_UTF8;
+ trail = 1;
+ mask = 0;
+ }
+ else if (saw_cntrl)
+ return APREQ_CHARSET_CP1252;
+ else
+ rv = APREQ_CHARSET_LATIN1;
+ }
+ else if (*src < 0xF0) {
+ mask = (*src & 0x0F) ? 0 : 0x20;
+ rv = APREQ_CHARSET_UTF8;
+ trail = 2;
+ }
+ else if (*src < 0xF8) {
+ mask = (*src & 0x07) ? 0 : 0x30;
+ rv = APREQ_CHARSET_UTF8;
+ trail = 3;
+ }
+ else if (*src < 0xFC) {
+ mask = (*src & 0x03) ? 0 : 0x38;
+ rv = APREQ_CHARSET_UTF8;
+ trail = 4;
+ }
+ else if (*src < 0xFE) {
+ mask = (*src & 0x01) ? 0 : 0x3C;
+ rv = APREQ_CHARSET_UTF8;
+ trail = 5;
+ }
+ else {
+ rv = APREQ_CHARSET_UTF8;
+ }
}
- return 1;
-}
-/* look for chars between 0x80 and 0x9F, inclusive */
-static APR_INLINE unsigned has_cntrl(const unsigned char *start,
- const unsigned char *end)
-{
- while (start <= end)
- if ((*start++ & 0xE0) == 0x80)
- return 1;
- return 0;
+ return trail ? saw_cntrl ?
+ APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1 : rv;
}
-
static APR_INLINE apr_uint16_t hex4_to_bmp(const char *what) {
register apr_uint16_t digit = 0;
@@ -312,20 +346,7 @@
}
-/*
- * Charset divination heuristics:
- * 1) presume ascii; if not, then
- * 2) presume utf8; if not, then
- * 3) presume latin1; unless there are control chars, in which case
- * 4) punt to cp1252.
- *
- * Note: in downgrading from 2 to 3, we need to be careful
- * about earlier control characters presumed to be valid utf8.
- */
-
-
static apr_status_t url_decode(char *dest, apr_size_t *dlen,
- apreq_charset_t *charset,
const char *src, apr_size_t *slen)
{
register const char *s = src;
@@ -343,185 +364,8 @@
case '%':
if (s + 2 < end && apr_isxdigit(s[1]) && apr_isxdigit(s[2]))
{
- unsigned char c;
- c = hex2_to_char(s + 1);
+ *d = hex2_to_char(s + 1);
s += 2;
- if (c < 0x80 || *charset == APREQ_CHARSET_CP1252)
- {
- *d = c;
- }
- else if (c < 0xA0) {
- /* these are ctrl chars in latin1 */
- *charset = APREQ_CHARSET_CP1252;
- *d = c;
- }
- else if (c < 0xC0) {
- *charset = APREQ_CHARSET_LATIN1;
- *d = c;
- }
- else if (*charset == APREQ_CHARSET_LATIN1) {
- *d = c;
- }
-
- /* utf8 cases */
-
- else if (c < 0xE0) {
- /* 2-byte utf8 */
- if (s + 3 >= end) {
- if (is_enc8_fragment(s+1, end)) {
- *charset = APREQ_CHARSET_UTF8;
- s -= 2;
- *dlen = d - start;
- *slen = s - src;
- memmove(d, s, end - s);
- d[end - s] = 0;
- return APR_INCOMPLETE;
- }
- *d = c;
- *charset = has_cntrl((unsigned char *)dest, d)
- ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
- }
- else if (is_enc8(s+1, 1)) {
- *charset = APREQ_CHARSET_UTF8;
- *d++ = c;
- *d = hex2_to_char(s+2);
- s += 3;
- }
- else {
- *d = c;
- *charset = has_cntrl((unsigned char *)dest, d)
- ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
- }
- }
- else if (c < 0xF0) {
- /* 3-byte utf8 */
- if (s + 6 >= end) {
- if (is_enc8_fragment(s+1, end)) {
- *charset = APREQ_CHARSET_UTF8;
- s -= 2;
- *dlen = d - start;
- *slen = s - src;
- memmove(d, s, end - s);
- d[end - s] = 0;
- return APR_INCOMPLETE;
- }
- *d = c;
- *charset = has_cntrl((unsigned char *)dest, d)
- ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
- }
- else if (is_enc8(s+1, 2)) {
- *charset = APREQ_CHARSET_UTF8;
- *d++ = c;
- *d++ = hex2_to_char(s+2);
- *d = hex2_to_char(s+5);
- s += 6;
- }
- else {
- *d = c;
- *charset = has_cntrl((unsigned char *)dest, d)
- ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
- }
-
- }
- else if (c < 0xF8) {
- /* 4-byte utf8 */
- if (s + 9 >= end) {
- if (is_enc8_fragment(s+1, end)) {
- *charset = APREQ_CHARSET_UTF8;
- s -= 2;
- *dlen = d - start;
- *slen = s - src;
- memmove(d, s, end - s);
- d[end - s] = 0;
- return APR_INCOMPLETE;
- }
- *d = c;
- *charset = has_cntrl((unsigned char *)dest, d)
- ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
- }
- else if (is_enc8(s+1, 3)) {
- *charset = APREQ_CHARSET_UTF8;
- *d++ = c;
- *d++ = hex2_to_char(s+2);
- *d++ = hex2_to_char(s+5);
- *d = hex2_to_char(s+8);
- s += 9;
- }
- else {
- *d = c;
- *charset = has_cntrl((unsigned char *)dest, d)
- ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
- }
-
- }
- else if (c < 0xFC) {
- /* 5-byte utf8 */
- if (s + 12 >= end) {
- if (is_enc8_fragment(s+1, end)) {
- *charset = APREQ_CHARSET_UTF8;
- s -= 2;
- *dlen = d - start;
- *slen = s - src;
- memmove(d, s, end - s);
- d[end - s] = 0;
- return APR_INCOMPLETE;
- }
- *d = c;
- *charset = has_cntrl((unsigned char *)dest, d)
- ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
- }
- else if (is_enc8(s+1, 4)) {
- *charset = APREQ_CHARSET_UTF8;
- *d++ = c;
- *d++ = hex2_to_char(s+2);
- *d++ = hex2_to_char(s+5);
- *d++ = hex2_to_char(s+8);
- *d = hex2_to_char(s+11);
- s += 12;
- }
- else {
- *d = c;
- *charset = has_cntrl((unsigned char *)dest, d)
- ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
- }
-
- }
- else if (c < 0xFE) {
- /* 6-byte utf8 */
- if (s + 15 >= end) {
- if (is_enc8_fragment(s+1, end)) {
- *charset = APREQ_CHARSET_UTF8;
- s -= 2;
- *dlen = d - start;
- *slen = s - src;
- memmove(d, s, end - s);
- d[end - s] = 0;
- return APR_INCOMPLETE;
- }
- *d = c;
- *charset = has_cntrl((unsigned char *)dest, d)
- ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
- }
- else if (is_enc8(s+1, 5)) {
- *charset = APREQ_CHARSET_UTF8;
- *d++ = c;
- *d++ = hex2_to_char(s+2);
- *d++ = hex2_to_char(s+5);
- *d++ = hex2_to_char(s+8);
- *d++ = hex2_to_char(s+11);
- *d = hex2_to_char(s+14);
- s += 15;
- }
- else {
- *d = c;
- *charset = has_cntrl((unsigned char *)dest, d)
- ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
- }
- }
- else {
- /* (skipped) utf8 byte-order mark */
- *charset = APREQ_CHARSET_UTF8;
- }
}
else if (s + 5 < end && (s[1] == 'u' || s[1] == 'U') &&
apr_isxdigit(s[2]) && apr_isxdigit(s[3]) &&
@@ -529,20 +373,6 @@
{
apr_uint16_t c = hex4_to_bmp(s+2);
- switch (*charset) {
- case APREQ_CHARSET_ASCII:
- *charset = APREQ_CHARSET_UTF8;
- case APREQ_CHARSET_UTF8:
- break;
-
- default:
- *dlen = d - start;
- *slen = s - src;
- *d = 0;
- return APREQ_ERROR_BADSEQ;
- }
-
-
if (c < 0x80) {
*d = c;
}
@@ -601,7 +431,6 @@
apr_size_t len = 0;
const char *end = s + slen;
apr_status_t rv;
- apreq_charset_t c = APREQ_CHARSET_ASCII;
if (s == (const char *)d) { /* optimize for src = dest case */
for ( ; d < end; ++d) {
@@ -617,26 +446,15 @@
slen -= len;
}
- rv = url_decode(d, dlen, &c, s, &slen);
-
- if (rv == APR_INCOMPLETE && c == APREQ_CHARSET_UTF8) {
- c = APREQ_CHARSET_LATIN1;
- len += *dlen;
- d += *dlen;
- slen = end - (s + slen);
- rv = url_decode(d, dlen, &c, d, &slen);
- }
-
- *dlen += len;
-
- return rv + c;
+ rv = url_decode(d, dlen, s, &slen);
+ return rv + apreq_charset_divine((unsigned char *)d, *dlen);
}
APREQ_DECLARE(apr_status_t) apreq_decodev(char *d, apr_size_t *dlen,
struct iovec *v, int nelts)
{
apr_status_t status = APR_SUCCESS;
- apreq_charset_t c = APREQ_CHARSET_ASCII;
+ const unsigned char *dest = (unsigned char *)d;
int n = 0;
*dlen = 0;
@@ -645,7 +463,7 @@
apr_size_t slen, len;
slen = v[n].iov_len;
- switch (status = url_decode(d,&len, &c, v[n].iov_base, &slen)) {
+ switch (status = url_decode(d, &len, v[n].iov_base, &slen)) {
case APR_SUCCESS:
d += len;
@@ -659,12 +477,7 @@
slen = v[n].iov_len - slen;
if (++n == nelts) {
- if (c == APREQ_CHARSET_UTF8) {
- c = APREQ_CHARSET_LATIN1;
- status = url_decode(d, &len, &c, d, &slen);
- *dlen += len;
- }
- return status + c;
+ return status;
}
memcpy(d + slen, v[n].iov_base, v[n].iov_len);
v[n].iov_len += slen;
@@ -677,7 +490,7 @@
}
}
- return status + c;
+ return status + apreq_charset_divine(dest, *dlen);
}