Author: joes
Date: Mon Aug 22 09:55:18 2005
New Revision: 235750

URL: http://svn.apache.org/viewcvs?rev=235750&view=rev
Log:
Factor out the charset detection code from url_decode
to apreq_charset_divine.  It's much cleaner to do the 
charset heuristics after decoding the string (instead of
doing it while decoding).

Modified:
    httpd/apreq/trunk/CHANGES
    httpd/apreq/trunk/include/apreq_util.h
    httpd/apreq/trunk/include/apreq_version.h
    httpd/apreq/trunk/library/t/util.c
    httpd/apreq/trunk/library/util.c

Modified: httpd/apreq/trunk/CHANGES
URL: http://svn.apache.org/viewcvs/httpd/apreq/trunk/CHANGES?rev=235750&r1=235749&r2=235750&view=diff
==============================================================================
--- httpd/apreq/trunk/CHANGES (original)
+++ httpd/apreq/trunk/CHANGES Mon Aug 22 09:55:18 2005
@@ -6,6 +6,9 @@
 
 
 - C API [joes]
+  Add apreq_charset_divine().
+
+- C API [joes]
   Improve the cp1252-charset heuristics for apreq_decode(v).
 
 - C API [Ralph Mattes]

Modified: httpd/apreq/trunk/include/apreq_util.h
URL: http://svn.apache.org/viewcvs/httpd/apreq/trunk/include/apreq_util.h?rev=235750&r1=235749&r2=235750&view=diff
==============================================================================
--- httpd/apreq/trunk/include/apreq_util.h (original)
+++ httpd/apreq/trunk/include/apreq_util.h Mon Aug 22 09:55:18 2005
@@ -129,6 +129,20 @@
                                                const char *src, apr_size_t slen);
 
 /**
+ * Heuristically determine the charset of a string.
+ *
+ * @param src  String to scan.
+ * @param slen Length of string.
+ *
+ * @return APREQ_CHARSET_ASCII  if the string contains only 7-bit chars;
+ * @return APREQ_CHARSET_UTF8   if the string is a valid utf8 byte sequence;
+ * @return APREQ_CHARSET_LATIN1 if the string has no control chars (0x80-0x9F);
+ * @return APREQ_CHARSET_CP1252 if the string has control chars (0x80-0x9F).
+ */
+APREQ_DECLARE(apreq_charset_t) apreq_charset_divine(const unsigned char *src,
+                                                    apr_size_t slen);
+
+/**
  * Url-decodes a string.
  *
  * @param dest Location of url-encoded result string. Caller must ensure dest is

Modified: httpd/apreq/trunk/include/apreq_version.h
URL: http://svn.apache.org/viewcvs/httpd/apreq/trunk/include/apreq_version.h?rev=235750&r1=235749&r2=235750&view=diff
==============================================================================
--- httpd/apreq/trunk/include/apreq_version.h (original)
+++ httpd/apreq/trunk/include/apreq_version.h Mon Aug 22 09:55:18 2005
@@ -58,10 +58,10 @@
  * Minor API changes that do not cause binary compatibility problems.
  * Should be reset to 0 when upgrading APREQ_MAJOR_VERSION
  */
-#define APREQ_MINOR_VERSION       3
+#define APREQ_MINOR_VERSION       4
 
 /** patch level */
-#define APREQ_PATCH_VERSION       1
+#define APREQ_PATCH_VERSION       0
 
 /**
  *  This symbol is defined for internal, "development" copies of libapreq.

Modified: httpd/apreq/trunk/library/t/util.c
URL: http://svn.apache.org/viewcvs/httpd/apreq/trunk/library/t/util.c?rev=235750&r1=235749&r2=235750&view=diff
==============================================================================
--- httpd/apreq/trunk/library/t/util.c (original)
+++ httpd/apreq/trunk/library/t/util.c Mon Aug 22 09:55:18 2005
@@ -107,7 +107,7 @@
     apr_status_t status;
 
     status = apreq_decodev(dest, &dest_len, iovec1, 3);
-    AT_int_eq(status, APR_SUCCESS + APREQ_CHARSET_UTF8);
+    AT_int_eq(status, APR_SUCCESS);
     AT_int_eq(dest_len, sizeof(expect1) - 1);
     AT_mem_eq(dest, expect1, sizeof(expect1) - 1);
 

Modified: httpd/apreq/trunk/library/util.c
URL: http://svn.apache.org/viewcvs/httpd/apreq/trunk/library/util.c?rev=235750&r1=235749&r2=235750&view=diff
==============================================================================
--- httpd/apreq/trunk/library/util.c (original)
+++ httpd/apreq/trunk/library/util.c Mon Aug 22 09:55:18 2005
@@ -217,74 +217,108 @@
 
 
 /**
- * Valid utf8 bit patterns:
+ * Valid utf8 bit patterns: (true utf8 must satisfy a minimality condition)
  *
  * 0xxxxxxx
- * 110xxxxx 10xxxxxx
- * 1110xxxx 10xxxxxx 10xxxxxx
- * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
- * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 110xxxxx 10xxxxxx                        minimality mask: 0x1E
+ * 1110xxxx 10xxxxxx 10xxxxxx                                0x0F || 0x20
+ * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx                       0x07 || 0x30
+ * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx              0x03 || 0x38
+ * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx     0x01 || 0x3C
+ *
+ * Charset divination heuristics:
+ * 1) presume ascii; if not, then
+ * 2) presume utf8; if not, then
+ * 3) presume latin1; unless there are control chars, in which case
+ * 4) punt to cp1252.
+ *
+ * Note: in downgrading from 2 to 3, we need to be careful
+ * about earlier control characters presumed to be valid utf8.
  */
 
-static APR_INLINE unsigned is_89AB(const char c)
-{
-
-    switch(c) {
-    case '8':
-    case '9':
-    case 'A':
-    case 'B':
-    case 'a':
-    case 'b':
-        return 1;
-    }
-    return 0;
-}
+APREQ_DECLARE(apreq_charset_t) apreq_charset_divine(const unsigned char *src,
+                                                    apr_size_t slen)
 
-static APR_INLINE unsigned is_enc8(const char *word, unsigned char wlen)
 {
+    apreq_charset_t rv = APREQ_CHARSET_ASCII;
+    register unsigned char trail = 0, saw_cntrl = 0, mask = 0;
+    const unsigned char *end = src + slen;
+
+    for (; src < end; ++src) {
+        if (trail) {
+            if ((*src & 0xC0) == 0x80 && (mask == 0 || (mask & *src))) {
+                mask = 0;
+                --trail;
 
-    while (wlen-- > 0) {
-        if (word[0] == '%' && is_89AB(word[1]) && apr_isxdigit(word[2]))
-            word += 3;
-        else
-            return 0;
-    }
-    return 1;
-}
+                if ((*src & 0xE0) == 0x80) {
+                    saw_cntrl = 1;
+                }
+            }
+            else {
+                trail = 0;
+                if (saw_cntrl)
+                    return APREQ_CHARSET_CP1252;
+                rv = APREQ_CHARSET_LATIN1;
+            }
+        }
+        else if (*src < 0x80) {
+            /* do nothing */
+        }
+        else if (*src < 0xA0) {
+            return APREQ_CHARSET_CP1252;
+        }
+        else if (*src < 0xC0) {
+            if (saw_cntrl)
+                return APREQ_CHARSET_CP1252;
+            rv = APREQ_CHARSET_LATIN1;
+        }
+        else if (rv == APREQ_CHARSET_LATIN1) {
+            /* do nothing */
+        }
 
-static APR_INLINE unsigned is_enc8_fragment(const char *word,
-                                            const char *end)
-{
-    unsigned char flen = end - word;
-    unsigned char wlen = flen / 3;
-    if (!is_enc8(word, wlen))
-        return 0;
-
-    switch (flen % 3) {
-    case 2:
-        if (!is_89AB(*--end))
-            return 0;
-    case 1:
-        if (*--end != '%')
-            return 0;
+        /* utf8 cases */
+
+        else if (*src < 0xE0) {
+            if (*src & 0x1E) {
+                rv = APREQ_CHARSET_UTF8;
+                trail = 1;
+                mask = 0;
+            }
+            else if (saw_cntrl)
+                return APREQ_CHARSET_CP1252;
+            else
+                rv = APREQ_CHARSET_LATIN1;
+        }
+        else if (*src < 0xF0) {
+            mask = (*src & 0x0F) ? 0 : 0x20;
+            rv = APREQ_CHARSET_UTF8;
+            trail = 2;
+        }
+        else if (*src < 0xF8) {
+            mask = (*src & 0x07) ? 0 : 0x30;
+            rv = APREQ_CHARSET_UTF8;
+            trail = 3;
+        }
+        else if (*src < 0xFC) {
+            mask = (*src & 0x03) ? 0 : 0x38;
+            rv = APREQ_CHARSET_UTF8;
+            trail = 4;
+        }
+        else if (*src < 0xFE) {
+            mask = (*src & 0x01) ? 0 : 0x3C;
+            rv = APREQ_CHARSET_UTF8;
+            trail = 5;
+        }
+        else {
+            rv = APREQ_CHARSET_UTF8;
+        }
     }
-    return 1;
-}
 
-/* look for chars between 0x80 and 0x9F, inclusive */
-static APR_INLINE unsigned has_cntrl(const unsigned char *start,
-                                     const unsigned char *end)
-{
-    while (start <= end)
-        if ((*start++ & 0xE0) == 0x80)
-            return 1;
-    return 0;
+    return trail ? saw_cntrl ?
+        APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1 : rv;
 }
 
 
-
 static APR_INLINE apr_uint16_t hex4_to_bmp(const char *what) {
     register apr_uint16_t digit = 0;
 
@@ -312,20 +346,7 @@
 }
 
 
-/*
- * Charset divination heuristics:
- * 1) presume ascii; if not, then
- * 2) presume utf8; if not, then
- * 3) presume latin1; unless there are control chars, in which case
- * 4) punt to cp1252.
- *
- * Note: in downgrading from 2 to 3, we need to be careful
- * about earlier control characters presumed to be valid utf8.
- */
-
-
 static apr_status_t url_decode(char *dest, apr_size_t *dlen,
-                               apreq_charset_t *charset,
                                const char *src, apr_size_t *slen)
 {
     register const char *s = src;
@@ -343,185 +364,8 @@
         case '%':
            if (s + 2 < end && apr_isxdigit(s[1]) && apr_isxdigit(s[2]))
             {
-                unsigned char c;
-               c = hex2_to_char(s + 1);
+                *d = hex2_to_char(s + 1);
                 s += 2;
-                if (c < 0x80 || *charset == APREQ_CHARSET_CP1252)
-                {
-                    *d = c;
-                }
-                else if (c < 0xA0) {
-                    /* these are ctrl chars in latin1 */
-                    *charset = APREQ_CHARSET_CP1252;
-                    *d = c;
-                }
-                else if (c < 0xC0) {
-                    *charset = APREQ_CHARSET_LATIN1;
-                    *d = c;
-                }
-                else if (*charset == APREQ_CHARSET_LATIN1) {
-                    *d = c;
-                }
-
-                /* utf8 cases */
-
-                else if (c < 0xE0) {
-                    /* 2-byte utf8 */
-                    if (s + 3 >= end) {
-                        if (is_enc8_fragment(s+1, end)) {
-                            *charset = APREQ_CHARSET_UTF8;
-                            s -= 2;
-                            *dlen = d - start;
-                            *slen = s - src;
-                            memmove(d, s, end - s);
-                            d[end - s] = 0;
-                            return APR_INCOMPLETE;
-                        }
-                        *d = c;
-                        *charset = has_cntrl((unsigned char *)dest, d)
-                            ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
-                    }
-                    else if (is_enc8(s+1, 1)) {
-                        *charset = APREQ_CHARSET_UTF8;
-                        *d++ = c;
-                        *d   = hex2_to_char(s+2);
-                        s += 3;
-                    }
-                    else {
-                        *d = c;
-                        *charset = has_cntrl((unsigned char *)dest, d)
-                            ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
-                    }
-                }
-                else if (c < 0xF0) {
-                    /* 3-byte utf8 */
-                    if (s + 6 >= end) {
-                        if (is_enc8_fragment(s+1, end)) {
-                            *charset = APREQ_CHARSET_UTF8;
-                            s -= 2;
-                            *dlen = d - start;
-                            *slen = s - src;
-                            memmove(d, s, end - s);
-                            d[end - s] = 0;
-                            return APR_INCOMPLETE;
-                        }
-                        *d = c;
-                        *charset = has_cntrl((unsigned char *)dest, d)
-                            ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
-                    }
-                    else if (is_enc8(s+1, 2)) {
-                        *charset = APREQ_CHARSET_UTF8;
-                        *d++ = c;
-                        *d++ = hex2_to_char(s+2);
-                        *d   = hex2_to_char(s+5);
-                        s += 6;
-                    }
-                    else {
-                        *d = c;
-                        *charset = has_cntrl((unsigned char *)dest, d)
-                            ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
-                    }
-
-                }
-                else if (c < 0xF8) {
-                    /* 4-byte utf8 */
-                    if (s + 9 >= end) {
-                        if (is_enc8_fragment(s+1, end)) {
-                            *charset = APREQ_CHARSET_UTF8;
-                            s -= 2;
-                            *dlen = d - start;
-                            *slen = s - src;
-                            memmove(d, s, end - s);
-                            d[end - s] = 0;
-                            return APR_INCOMPLETE;
-                        }
-                        *d = c;
-                        *charset = has_cntrl((unsigned char *)dest, d)
-                            ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
-                    }
-                    else if (is_enc8(s+1, 3)) {
-                        *charset = APREQ_CHARSET_UTF8;
-                        *d++ = c;
-                        *d++ = hex2_to_char(s+2);
-                        *d++ = hex2_to_char(s+5);
-                        *d   = hex2_to_char(s+8);
-                        s += 9;
-                    }
-                    else {
-                        *d = c;
-                        *charset = has_cntrl((unsigned char *)dest, d)
-                            ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
-                    }
-
-                }
-                else if (c < 0xFC) {
-                    /* 5-byte utf8 */
-                    if (s + 12 >= end) {
-                        if (is_enc8_fragment(s+1, end)) {
-                            *charset = APREQ_CHARSET_UTF8;
-                            s -= 2;
-                            *dlen = d - start;
-                            *slen = s - src;
-                            memmove(d, s, end - s);
-                            d[end - s] = 0;
-                            return APR_INCOMPLETE;
-                        }
-                        *d = c;
-                        *charset = has_cntrl((unsigned char *)dest, d)
-                            ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
-                    }
-                    else if (is_enc8(s+1, 4)) {
-                        *charset = APREQ_CHARSET_UTF8;
-                        *d++ = c;
-                        *d++ = hex2_to_char(s+2);
-                        *d++ = hex2_to_char(s+5);
-                        *d++ = hex2_to_char(s+8);
-                        *d   = hex2_to_char(s+11);
-                        s += 12;
-                    }
-                    else {
-                        *d = c;
-                        *charset = has_cntrl((unsigned char *)dest, d)
-                            ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
-                    }
-
-                }
-                else if (c < 0xFE) {
-                    /* 6-byte utf8 */
-                    if (s + 15 >= end) {
-                        if (is_enc8_fragment(s+1, end)) {
-                            *charset = APREQ_CHARSET_UTF8;
-                            s -= 2;
-                            *dlen = d - start;
-                            *slen = s - src;
-                            memmove(d, s, end - s);
-                            d[end - s] = 0;
-                            return APR_INCOMPLETE;
-                        }
-                        *d = c;
-                        *charset = has_cntrl((unsigned char *)dest, d)
-                            ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
-                    }
-                    else if (is_enc8(s+1, 5)) {
-                        *charset = APREQ_CHARSET_UTF8;
-                        *d++ = c;
-                        *d++ = hex2_to_char(s+2);
-                        *d++ = hex2_to_char(s+5);
-                        *d++ = hex2_to_char(s+8);
-                        *d++ = hex2_to_char(s+11);
-                        *d   = hex2_to_char(s+14);
-                        s += 15;
-                    }
-                    else {
-                        *d = c;
-                        *charset = has_cntrl((unsigned char *)dest, d)
-                            ? APREQ_CHARSET_CP1252 : APREQ_CHARSET_LATIN1;
-                    }
-                }
-                else {
-                    /* (skipped) utf8 byte-order mark */
-                    *charset = APREQ_CHARSET_UTF8;
-                }
            }
             else if (s + 5 < end && (s[1] == 'u' || s[1] == 'U') &&
                      apr_isxdigit(s[2]) && apr_isxdigit(s[3]) &&
@@ -529,20 +373,6 @@
             {
                 apr_uint16_t c = hex4_to_bmp(s+2);
 
-                switch (*charset) {
-                case APREQ_CHARSET_ASCII:
-                    *charset = APREQ_CHARSET_UTF8;
-                case APREQ_CHARSET_UTF8:
-                    break;
-
-                default:
-                    *dlen = d - start;
-                    *slen = s - src;
-                    *d = 0;
-                    return APREQ_ERROR_BADSEQ;
-                }
-
-
                 if (c < 0x80) {
                     *d = c;
                 }
@@ -601,7 +431,6 @@
     apr_size_t len = 0;
     const char *end = s + slen;
     apr_status_t rv;
-    apreq_charset_t c = APREQ_CHARSET_ASCII;
 
     if (s == (const char *)d) {     /* optimize for src = dest case */
         for ( ; d < end; ++d) {
@@ -617,26 +446,15 @@
         slen -= len;
     }
 
-    rv = url_decode(d, dlen, &c, s, &slen);
-
-    if (rv == APR_INCOMPLETE && c == APREQ_CHARSET_UTF8) {
-        c = APREQ_CHARSET_LATIN1;
-        len += *dlen;
-        d += *dlen;
-        slen = end - (s + slen);
-        rv = url_decode(d, dlen, &c, d, &slen);
-    }
-
-    *dlen += len;
-
-    return rv + c;
+    rv = url_decode(d, dlen, s, &slen);
+    return rv + apreq_charset_divine((unsigned char *)d, *dlen);
 }
 
 APREQ_DECLARE(apr_status_t) apreq_decodev(char *d, apr_size_t *dlen,
                                           struct iovec *v, int nelts)
 {
     apr_status_t status = APR_SUCCESS;
-    apreq_charset_t c = APREQ_CHARSET_ASCII;
+    const unsigned char *dest = (unsigned char *)d;
     int n = 0;
 
     *dlen = 0;
@@ -645,7 +463,7 @@
         apr_size_t slen, len;
 
         slen = v[n].iov_len;
-        switch (status = url_decode(d,&len, &c, v[n].iov_base, &slen)) {
+        switch (status = url_decode(d, &len, v[n].iov_base, &slen)) {
 
         case APR_SUCCESS:
             d += len;
@@ -659,12 +477,7 @@
             slen = v[n].iov_len - slen;
 
             if (++n == nelts) {
-                if (c == APREQ_CHARSET_UTF8) {
-                    c = APREQ_CHARSET_LATIN1;
-                    status = url_decode(d, &len, &c, d, &slen);
-                    *dlen += len;
-                }
-                return status + c;
+                return status;
             }
             memcpy(d + slen, v[n].iov_base, v[n].iov_len);
             v[n].iov_len += slen;
@@ -677,7 +490,7 @@
         }
     }
 
-    return status + c;
+    return status + apreq_charset_divine(dest, *dlen);
 }
 
 


Reply via email to