Author: stefan2
Date: Thu Sep 6 21:30:40 2012
New Revision: 1381766
URL: http://svn.apache.org/viewvc?rev=1381766&view=rev
Log:
Many strings we need to convert to UTF-8 (paths, even log messages)
contain large ASCII-only sections. Add utility functions
to very efficiently skip those sections at the beginning of the strings.
* subversion/libsvn_subr/utf_validate.c
(first_non_ascii_char,
first_non_ascii_char_cstring): new utility functions
(svn_utf__last_valid,
svn_utf__cstring_is_valid,
svn_utf__is_valid,
svn_utf__last_valid2): use the new functions to quickly
skip ASCII-only sections at the head of the strings
Modified:
subversion/trunk/subversion/libsvn_subr/utf_validate.c
Modified: subversion/trunk/subversion/libsvn_subr/utf_validate.c
URL:
http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/utf_validate.c?rev=1381766&r1=1381765&r2=1381766&view=diff
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf_validate.c (original)
+++ subversion/trunk/subversion/libsvn_subr/utf_validate.c Thu Sep 6 21:30:40
2012
@@ -57,6 +57,7 @@
*/
#include "private/svn_utf_private.h"
+#include "private/svn_eol_private.h"
/* Lookup table to categorise each octet in the string. */
static const char octet_category[256] = {
@@ -249,12 +250,90 @@ static const char machine [9][14] = {
FSM_ERROR}, /* 0xf5-0xff */
};
+/* Scan MAX_LEN bytes in *DATA for non-ASCII chars.  Return the position
+ * of the first non-ASCII char or DATA + MAX_LEN if all were ASCII.
+ *
+ * A byte counts as non-ASCII when its bit 7 is set.  Bytes are tested as
+ * unsigned char because the signedness of plain char is
+ * implementation-defined; a "< 0" test would never fire where char is
+ * unsigned.
+ */
+static const char *
+first_non_ascii_char(const char *data, apr_size_t max_len)
+{
+#if !SVN_UNALIGNED_ACCESS_IS_OK
+
+  /* On some systems, we need to make sure that DATA is properly aligned
+   * for chunky data access.  Examine the leading bytes one at a time
+   * until DATA sits on a machine word boundary.
+   */
+  if ((apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1))
+    {
+      /* Number of bytes from DATA up to the next word boundary.  Note
+       * that this is SIZE - (DATA mod SIZE), not ~DATA mod SIZE; the
+       * latter is one byte short and would leave DATA misaligned.
+       */
+      apr_size_t len = sizeof(apr_uintptr_t)
+                     - ((apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1));
+      if (len > max_len)
+        len = max_len;
+      max_len -= len;
+
+      for (; len > 0; ++data, --len)
+        if ((unsigned char)*data >= 0x80)
+          return data;
+    }
+
+#endif
+
+  /* Scan the input one machine word at a time.  Stop at the first word
+   * that has bit 7 set in any of its bytes. */
+  for (; max_len > sizeof(apr_uintptr_t)
+       ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t))
+    if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET)
+      break;
+
+  /* The remaining odd bytes (and the bytes of the word we may have
+   * stopped at) will be examined the naive way: */
+  for (; max_len > 0; ++data, --max_len)
+    if ((unsigned char)*data >= 0x80)
+      break;
+
+  return data;
+}
+
+
+/* Scan the C string in *DATA for non-ASCII chars.  Return the position
+ * of either the first non-ASCII char or the terminating NUL.
+ *
+ * Bytes are tested as unsigned char because the signedness of plain
+ * char is implementation-defined; a "<= 0" test would only detect the
+ * NUL where char is unsigned and silently skip all non-ASCII bytes.
+ */
+static const char *
+first_non_ascii_char_cstring(const char *data)
+{
+  /* We need to make sure that DATA is properly aligned for chunky data
+   * access because we don't know the string's length.  Unaligned chunk
+   * read access beyond the NUL terminator could therefore result in a
+   * segfault.
+   */
+  for (; (apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1); ++data)
+    if (*data == 0 || (unsigned char)*data >= 0x80)
+      return data;
+
+  /* Scan the input one machine word at a time. */
+  for (; ; data += sizeof(apr_uintptr_t))
+    {
+      /* Check for non-ASCII chars: */
+      apr_uintptr_t chunk = *(const apr_uintptr_t *)data;
+      if (chunk & SVN__BIT_7_SET)
+        break;
+
+      /* This is the well-known strlen test.  Bit 7 of every byte is
+       * known to be clear here, so the addition cannot carry into the
+       * neighbouring byte; afterwards bit 7 is set in exactly those
+       * bytes that were not NUL (assuming SVN__LOWER_7BITS_SET covers
+       * the low 7 bits of each byte, as its name suggests). */
+      chunk |= (chunk & SVN__LOWER_7BITS_SET) + SVN__LOWER_7BITS_SET;
+      if ((chunk & SVN__BIT_7_SET) != SVN__BIT_7_SET)
+        break;
+    }
+
+  /* The word we stopped at contains a NUL or a non-ASCII byte.
+   * Locate it the naive way: */
+  while (*data != 0 && (unsigned char)*data < 0x80)
+    ++data;
+
+  return data;
+}
const char *
svn_utf__last_valid(const char *data, apr_size_t len)
{
- const char *start = data, *end = data + len;
+ const char *start = first_non_ascii_char(data, len);
+ const char *end = data + len;
int state = FSM_START;
+
+ data = start;
while (data < end)
{
unsigned char octet = *data++;
@@ -270,6 +349,8 @@ svn_boolean_t
svn_utf__cstring_is_valid(const char *data)
{
int state = FSM_START;
+ data = first_non_ascii_char_cstring(data);
+
while (*data)
{
unsigned char octet = *data++;
@@ -284,6 +365,8 @@ svn_utf__is_valid(const char *data, apr_
{
const char *end = data + len;
int state = FSM_START;
+ data = first_non_ascii_char(data, len);
+
while (data < end)
{
unsigned char octet = *data++;
@@ -296,8 +379,11 @@ svn_utf__is_valid(const char *data, apr_
const char *
svn_utf__last_valid2(const char *data, apr_size_t len)
{
- const char *start = data, *end = data + len;
+ const char *start = first_non_ascii_char(data, len);
+ const char *end = data + len;
int state = FSM_START;
+
+ data = start;
while (data < end)
{
unsigned char octet = *data++;