Author: brane
Date: Thu Aug  8 11:08:33 2013
New Revision: 1511695

URL: http://svn.apache.org/r1511695
Log:
Added a private function that checks if a UTF-8 string is normalized.

* subversion/include/private/svn_utf_private.h (svn_utf__is_normalized): New.
* subversion/libsvn_subr/utf8proc.c (normalize_cstring): New.
  (svn_utf__is_normalized): Implement, using normalize_cstring.

* subversion/tests/libsvn_subr/utf-test.c
  (test_utf_is_normalized): New test for svn_utf__is_normalized.
  (test_funcs): Added test_utf_is_normalized.

Modified:
    subversion/trunk/subversion/include/private/svn_utf_private.h
    subversion/trunk/subversion/libsvn_subr/utf8proc.c
    subversion/trunk/subversion/tests/libsvn_subr/utf-test.c

Modified: subversion/trunk/subversion/include/private/svn_utf_private.h
URL: 
http://svn.apache.org/viewvc/subversion/trunk/subversion/include/private/svn_utf_private.h?rev=1511695&r1=1511694&r2=1511695&view=diff
==============================================================================
--- subversion/trunk/subversion/include/private/svn_utf_private.h (original)
+++ subversion/trunk/subversion/include/private/svn_utf_private.h Thu Aug  8 
11:08:33 2013
@@ -112,6 +112,14 @@ svn_utf__normcmp(int *result,
                  const char *str2, apr_size_t len2,
                  svn_membuf_t *buf1, svn_membuf_t *buf2);
 
+/* Return TRUE if STRING is a valid, NFC-normalized UTF-8 string, and
+ * FALSE otherwise.  Note that a FALSE return value may indicate that
+ * STRING is not valid UTF-8 at all.
+ *
+ * STRING must be NUL-terminated.
+ *
+ * Use SCRATCH_POOL for temporary allocations.
+ */
+svn_boolean_t
+svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool);
 
 /* Pattern matching similar to the SQLite LIKE and GLOB
  * operators. PATTERN, KEY and ESCAPE must all point to UTF-8

Modified: subversion/trunk/subversion/libsvn_subr/utf8proc.c
URL: 
http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/utf8proc.c?rev=1511695&r1=1511694&r2=1511695&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf8proc.c (original)
+++ subversion/trunk/subversion/libsvn_subr/utf8proc.c Thu Aug  8 11:08:33 2013
@@ -105,6 +105,34 @@ decompose_normalized(apr_size_t *result_
   return SVN_NO_ERROR;
 }
 
+/* Normalize the UTF-8 STRING to NFC, leaving the result in BUFFER and
+ * its length in bytes in *RESULT_LENGTH. LENGTH is the number of bytes
+ * of STRING to look at, or SVN_UTF__UNKNOWN_LENGTH if STRING is
+ * NUL-terminated. On success, BUFFER->data is a NUL-terminated UTF-8
+ * string.
+ *
+ * An error return may mean that STRING contains invalid UTF-8 or
+ * invalid Unicode codepoints; its message is supplied by utf8proc.
+ */
+static svn_error_t *
+normalize_cstring(apr_size_t *result_length,
+                  const char *string, apr_size_t length,
+                  svn_membuf_t *buffer)
+{
+  ssize_t status = unicode_decomposition(0, string, length, buffer);
+
+  if (status >= 0)
+    {
+      /* Room for STATUS apr_int32_t elements plus one more byte
+         (presumably for the terminating NUL). */
+      const apr_size_t capacity = status * sizeof(apr_int32_t) + 1;
+
+      svn_membuf__resize(buffer, capacity);
+      status = utf8proc_reencode(buffer->data, status,
+                                 UTF8PROC_COMPOSE | UTF8PROC_STABLE);
+      if (status >= 0)
+        {
+          *result_length = status;
+          return SVN_NO_ERROR;
+        }
+    }
+
+  /* Either the decomposition or the re-encoding step failed. */
+  return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
+                          gettext(utf8proc_errmsg(status)));
+}
+
 /* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of
  * length LENB. Return 0 if they're equal, a negative value if BUFA is
  * less than BUFB, otherwise a positive value.
@@ -305,6 +333,22 @@ svn_utf__glob(svn_boolean_t *match,
   return SVN_NO_ERROR;
 }
 
+/* Return TRUE only when normalizing STRING succeeds and yields a
+ * result of the same length and content as STRING itself; a failed
+ * normalization is reported as FALSE. Temporary storage comes from
+ * SCRATCH_POOL.
+ */
+svn_boolean_t
+svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool)
+{
+  const apr_size_t input_len = strlen(string);
+  apr_size_t normalized_len;
+  svn_membuf_t normalized;
+  svn_error_t *failure;
+
+  svn_membuf__create(&normalized, input_len * sizeof(apr_int32_t),
+                     scratch_pool);
+  failure = normalize_cstring(&normalized_len, string, input_len,
+                              &normalized);
+  if (failure)
+    {
+      /* The caller only gets a boolean, so discard the error. */
+      svn_error_clear(failure);
+      return FALSE;
+    }
+
+  /* A different length already proves the strings differ. */
+  if (input_len != normalized_len)
+    return FALSE;
+
+  return (0 == strcmp(string, normalized.data));
+}
 
 const char *
 svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool)

Modified: subversion/trunk/subversion/tests/libsvn_subr/utf-test.c
URL: 
http://svn.apache.org/viewvc/subversion/trunk/subversion/tests/libsvn_subr/utf-test.c?rev=1511695&r1=1511694&r2=1511695&view=diff
==============================================================================
--- subversion/trunk/subversion/tests/libsvn_subr/utf-test.c (original)
+++ subversion/trunk/subversion/tests/libsvn_subr/utf-test.c Thu Aug  8 
11:08:33 2013
@@ -674,6 +674,69 @@ test_utf_fuzzy_escape(apr_pool_t *pool)
   return SVN_NO_ERROR;
 }
 
+/* Check svn_utf__is_normalized against four fixed strings: it must
+ * accept the NFC string and reject the NFD string, the string that
+ * mixes composed and decomposed forms, and the invalid UTF-8 string.
+ * POOL is passed through as the scratch pool.
+ */
+static svn_error_t *
+test_utf_is_normalized(apr_pool_t *pool)
+{
+  /* Normalized: NFC */
+  static const char nfc[] =
+    "\xe1\xb9\xa8"              /* S with dot above and below */
+    "\xc5\xaf"                  /* u with ring */
+    "\xe1\xb8\x87"              /* b with macron below */
+    "\xe1\xb9\xbd"              /* v with tilde */
+    "\xe1\xb8\x9d"              /* e with breve and cedilla */
+    "\xc8\x91"                  /* r with double grave */
+    "\xc5\xa1"                  /* s with caron */
+    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
+    "\xe1\xbb\x9d"              /* o with horn and grave */
+    "\xe1\xb9\x8b";             /* n with circumflex below */
+
+  /* Normalized: NFD */
+  static const char nfd[] =
+    "S\xcc\xa3\xcc\x87"         /* S with dot above and below */
+    "u\xcc\x8a"                 /* u with ring */
+    "b\xcc\xb1"                 /* b with macron below */
+    "v\xcc\x83"                 /* v with tilde */
+    "e\xcc\xa7\xcc\x86"         /* e with breve and cedilla */
+    "r\xcc\x8f"                 /* r with double grave */
+    "s\xcc\x8c"                 /* s with caron */
+    "i\xcc\x88\xcc\x81"         /* i with diaeresis and acute */
+    "o\xcc\x9b\xcc\x80"         /* o with horn and grave */
+    "n\xcc\xad";                /* n with circumflex below */
+
+  /* Mixed, denormalized: alternates decomposed and composed characters;
+     two of the decomposed ones have their marks swapped relative to
+     the NFD string above. */
+  static const char mixup[] =
+    "S\xcc\x87\xcc\xa3"         /* S with dot above and below (swapped) */
+    "\xc5\xaf"                  /* u with ring */
+    "b\xcc\xb1"                 /* b with macron below */
+    "\xe1\xb9\xbd"              /* v with tilde */
+    "e\xcc\xa7\xcc\x86"         /* e with breve and cedilla */
+    "\xc8\x91"                  /* r with double grave */
+    "s\xcc\x8c"                 /* s with caron */
+    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
+    "o\xcc\x80\xcc\x9b"         /* o with horn and grave (swapped) */
+    "\xe1\xb9\x8b";             /* n with circumflex below */
+
+  /* Invalid UTF-8: same as the NFC string, but with the "o" character
+     replaced by a stray byte. */
+  static const char invalid[] =
+    "\xe1\xb9\xa8"              /* S with dot above and below */
+    "\xc5\xaf"                  /* u with ring */
+    "\xe1\xb8\x87"              /* b with macron below */
+    "\xe1\xb9\xbd"              /* v with tilde */
+    "\xe1\xb8\x9d"              /* e with breve and cedilla */
+    "\xc8\x91"                  /* r with double grave */
+    "\xc5\xa1"                  /* s with caron */
+    "\xe1\xb8\xaf"              /* i with diaeresis and acute */
+    "\xe6"                      /* Invalid: lead byte, no continuation */
+    "\xe1\xb9\x8b";             /* n with circumflex below */
+
+  /* Only the NFC string may be reported as normalized. */
+  SVN_ERR_ASSERT(svn_utf__is_normalized(nfc, pool));
+  SVN_ERR_ASSERT(!svn_utf__is_normalized(nfd, pool));
+  SVN_ERR_ASSERT(!svn_utf__is_normalized(mixup, pool));
+  SVN_ERR_ASSERT(!svn_utf__is_normalized(invalid, pool));
+
+  return SVN_NO_ERROR;
+}
+
 
 /* The test table.  */
 
@@ -694,5 +757,7 @@ struct svn_test_descriptor_t test_funcs[
                    "test svn_utf__glob"),
     SVN_TEST_PASS2(test_utf_fuzzy_escape,
                    "test svn_utf__fuzzy_escape"),
+    SVN_TEST_PASS2(test_utf_is_normalized,
+                   "test svn_utf__is_normalized"),
     SVN_TEST_NULL
   };


Reply via email to