utf_validate.c

stefan2 Fri, 07 Sep 2012 15:59:56 -0700

Author: stefan2
Date: Fri Sep  7 22:59:09 2012
New Revision: 1382204

URL: http://svn.apache.org/viewvc?rev=1382204&view=rev
Log:
Make clear that this is, in fact, the *UTF*_validate file.
So, we are not dealing with true ASCII representations here
but 1-byte encoded UTF-8 codepoints.


Also, support platforms on which plain char is unsigned by default,
without giving up any efficiency on the others.

* subversion/libsvn_subr/utf_validate.c
  (first_non_ascii_char,
   first_non_ascii_char_cstring): rename to ...
  (first_non_fsm_start_char,
   first_non_fsm_start_char_cstring): ... this; support platforms
   on which chars are not signed by default
  (svn_utf__last_valid,
   svn_utf__cstring_is_valid,
   svn_utf__is_valid,
   svn_utf__last_valid2): update callers

Modified:
    subversion/trunk/subversion/libsvn_subr/utf_validate.c

Modified: subversion/trunk/subversion/libsvn_subr/utf_validate.c
URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/utf_validate.c?rev=1382204&r1=1382203&r2=1382204&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf_validate.c (original)
+++ subversion/trunk/subversion/libsvn_subr/utf_validate.c Fri Sep  7 22:59:09 2012
@@ -250,11 +250,12 @@ static const char machine [9][14] = {
    FSM_ERROR},        /* 0xf5-0xff */
 };
 
-/* Scan MAX_LEN bytes in *DATA for non-ASCII chars. Return the position
- * of the first non-ASCII char or DATA + MAX_LEN if all were ASCII.
+/* Scan MAX_LEN bytes in *DATA for chars that are not in the octet
+ * category 0 (FSM_START).  Return the position of the first such char
+ * or DATA + MAX_LEN if all were cat 0.
  */
 static const char *
-first_non_ascii_char(const char *data, apr_size_t max_len)
+first_non_fsm_start_char(const char *data, apr_size_t max_len)
 {
 #if !SVN_UNALIGNED_ACCESS_IS_OK
 
@@ -269,8 +270,8 @@ first_non_ascii_char(const char *data, a
       max_len -= len;
 
       for (; len > 0; ++data, --len)
-          if (*data < 0)
-            return data;
+        if (*data < 0 || *data >= 0x80)
+          return data;
     }
     
 #endif
@@ -283,17 +284,18 @@ first_non_ascii_char(const char *data, a
 
   /* The remaining odd bytes will be examined the naive way: */
   for (; max_len > 0; ++data, --max_len)
-    if (*data < 0)
+    if (*data < 0 || *data >= 0x80)
       return data;
 
   return data;
 }
 
-/* Scan the C string in *DATA for non-ASCII chars. Return the position
- * of either the first non-ASCII char or the terminating NUL.
+/* Scan the C string in *DATA for chars that are not in the octet
+ * category 0 (FSM_START).  Return the position of either the first
+ * such char or of the terminating NUL.
  */
 static const char *
-first_non_ascii_char_cstring(const char *data)
+first_non_fsm_start_char_cstring(const char *data)
 {
   /* We need to make sure that BUF is properly aligned for chunky data
    * access because we don't know the string's length. Unaligned chunk
@@ -301,7 +303,7 @@ first_non_ascii_char_cstring(const char 
    * segfault.
    */
   for (; (apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1); ++data)
-    if (*data <= 0)
+    if (*data <= 0 || *data >= 0x80)
       return data;
 
   /* Scan the input one machine word at a time. */
@@ -320,7 +322,7 @@ first_non_ascii_char_cstring(const char 
 
   /* The remaining odd bytes will be examined the naive way: */
   for (; ; ++data)
-    if (*data <= 0)
+    if (*data <= 0 || *data >= 0x80)
       return data;
 
   return data;
@@ -329,7 +331,7 @@ first_non_ascii_char_cstring(const char 
 const char *
 svn_utf__last_valid(const char *data, apr_size_t len)
 {
-  const char *start = first_non_ascii_char(data, len);
+  const char *start = first_non_fsm_start_char(data, len);
   const char *end = data + len;
   int state = FSM_START;
 
@@ -349,7 +351,7 @@ svn_boolean_t
 svn_utf__cstring_is_valid(const char *data)
 {
   int state = FSM_START;
-  data = first_non_ascii_char_cstring(data);
+  data = first_non_fsm_start_char_cstring(data);
 
   while (*data)
     {
@@ -365,7 +367,7 @@ svn_utf__is_valid(const char *data, apr_
 {
   const char *end = data + len;
   int state = FSM_START;
-  data = first_non_ascii_char(data, len);
+  data = first_non_fsm_start_char(data, len);
 
   while (data < end)
     {
@@ -379,7 +381,7 @@ svn_utf__is_valid(const char *data, apr_
 const char *
 svn_utf__last_valid2(const char *data, apr_size_t len)
 {
-  const char *start = first_non_ascii_char(data, len);
+  const char *start = first_non_fsm_start_char(data, len);
   const char *end = data + len;
   int state = FSM_START;

svn commit: r1382204 - /subversion/trunk/subversion/libsvn_subr/utf_validate.c

Reply via email to