On Thu, Sep 6, 2012 at 11:30 PM, <stef...@apache.org> wrote: > Author: stefan2 > Date: Thu Sep 6 21:30:40 2012 > New Revision: 1381766 > > URL: http://svn.apache.org/viewvc?rev=1381766&view=rev > Log: > Many strings we need to convert to UTF-8 (paths, even log messages) > contain large ASCII chars only sections. Add utility functions > to very efficiently skip those sections at the begin of the strings. > > * subversion/libsvn_subr/utf_validate.c > (first_non_ascii_char, > first_non_ascii_char_cstring): new utility functions > (svn_utf__last_valid, > svn_utf__cstring_is_valid, > svn_utf__is_valid, > svn_utf__last_valid2): use the new functions to quickly > skip ASCII-only sections at the head of the strings > > > Modified: > subversion/trunk/subversion/libsvn_subr/utf_validate.c > > Modified: subversion/trunk/subversion/libsvn_subr/utf_validate.c > URL: > http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/utf_validate.c?rev=1381766&r1=1381765&r2=1381766&view=diff > ============================================================================== > --- subversion/trunk/subversion/libsvn_subr/utf_validate.c (original) > +++ subversion/trunk/subversion/libsvn_subr/utf_validate.c Thu Sep 6 > 21:30:40 2012 > @@ -57,6 +57,7 @@ > */ > > #include "private/svn_utf_private.h" > +#include "private/svn_eol_private.h" > > /* Lookup table to categorise each octet in the string. */ > static const char octet_category[256] = { > @@ -249,12 +250,90 @@ static const char machine [9][14] = { > FSM_ERROR}, /* 0xf5-0xff */ > }; > > +/* Scan MAX_LEN bytes in *DATA for non-ASCII chars. Return the position > + * of the first non-ASCII char or DATA + MAX_LEN if all were ASCII. > + */ > +static const char * > +first_non_ascii_char(const char *data, apr_size_t max_len) > +{ > +#if !SVN_UNALIGNED_ACCESS_IS_OK > + > + /* On some systems, we need to make sure that buf is properly aligned > + * for chunky data access. 
> + */ > + if ((apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1)) > + { > + apr_size_t len = (~(apr_uintptr_t)data) & (sizeof(apr_uintptr_t)-1); > + if (len > max_len) > + len = max_len; > + max_len -= len; > + > + for (; len > 0; ++data, --len) > + if (*data < 0) > + return data; > + } > + > +#endif > + > + /* Scan the input one machine word at a time. */ > + for (; max_len > sizeof(apr_uintptr_t) > + ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t)) > + if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET) > + break; > + > + /* The remaining odd bytes will be examined the naive way: */ > + for (; max_len > 0; ++data, --max_len) > + if (*data < 0) > + return data; > + > + return data; > +} > + > +/* Scan the C string in *DATA for non-ASCII chars. Return the position > + * of either the first non-ASCII char or the terminating NUL. > + */ > +static const char * > +first_non_ascii_char_cstring(const char *data) > +{ > + /* We need to make sure that BUF is properly aligned for chunky data > + * access because we don't know the string's length. Unaligned chunk > + * read access beyond the NUL terminator could therefore result in a > + * segfault. > + */ > + for (; (apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1); ++data) > + if (*data <= 0) > + return data; > + > + /* Scan the input one machine word at a time. */ > + for (; ; data += sizeof(apr_uintptr_t)) > + { > + /* Check for non-ASCII chars: */ > + apr_uintptr_t chunk = *(const apr_uintptr_t *)data; > + if (chunk & SVN__BIT_7_SET) > + break; > + > + /* This is the well-known strlen test: */ > + chunk |= (chunk & SVN__LOWER_7BITS_SET) + SVN__LOWER_7BITS_SET; > + if ((chunk & SVN__BIT_7_SET) != SVN__BIT_7_SET) > + break; > + } > + > + /* The remaining odd bytes will be examined the naive way: */ > + for (; ; ++data) > + if (*data <= 0) > + return data; > + > + return data;
I get a compiler warning here (VS 2010): c:\research\svn\client_build\trunk2\subversion\libsvn_subr\utf_validate.c(328): warning C4702: unreachable code It seems that the last "return data;" is superfluous, since the loop above it can only end by returning. -- Johan