utf_validate.c

Johan Corveleyn Sun, 09 Sep 2012 15:59:18 -0700

On Thu, Sep 6, 2012 at 11:30 PM,  <stef...@apache.org> wrote:
> Author: stefan2
> Date: Thu Sep  6 21:30:40 2012
> New Revision: 1381766
>
> URL: http://svn.apache.org/viewvc?rev=1381766&view=rev
> Log:
> Many strings we need to convert to UTF-8 (paths, even log messages)
> contain large ASCII chars only sections.  Add utility functions
> to very efficiently skip those sections at the begin of the strings.
>
> * subversion/libsvn_subr/utf_validate.c
>   (first_non_ascii_char,
>    first_non_ascii_char_cstring): new utility functions
>   (svn_utf__last_valid,
>    svn_utf__cstring_is_valid,
>    svn_utf__is_valid,
>    svn_utf__last_valid2): use the new functions to quickly
>    skip ASCII-only sections at the head of the strings
>
>
> Modified:
>     subversion/trunk/subversion/libsvn_subr/utf_validate.c
>
> Modified: subversion/trunk/subversion/libsvn_subr/utf_validate.c
> URL: http://svn.apache.org/viewvc/subversion/trunk/subversion/libsvn_subr/utf_validate.c?rev=1381766&r1=1381765&r2=1381766&view=diff
> ==============================================================================
> --- subversion/trunk/subversion/libsvn_subr/utf_validate.c (original)
> +++ subversion/trunk/subversion/libsvn_subr/utf_validate.c Thu Sep  6 21:30:40 2012
> @@ -57,6 +57,7 @@
>   */
>
>  #include "private/svn_utf_private.h"
> +#include "private/svn_eol_private.h"
>
>  /* Lookup table to categorise each octet in the string. */
>  static const char octet_category[256] = {
> @@ -249,12 +250,90 @@ static const char machine [9][14] = {
>     FSM_ERROR},        /* 0xf5-0xff */
>  };
>
> +/* Scan MAX_LEN bytes in *DATA for non-ASCII chars. Return the position
> + * of the first non-ASCII char or DATA + MAX_LEN if all were ASCII.
> + */
> +static const char *
> +first_non_ascii_char(const char *data, apr_size_t max_len)
> +{
> +#if !SVN_UNALIGNED_ACCESS_IS_OK
> +
> +  /* On some systems, we need to make sure that buf is properly aligned
> +   * for chunky data access.
> +   */
> +  if ((apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1))
> +    {
> +      apr_size_t len = (~(apr_uintptr_t)data) & (sizeof(apr_uintptr_t)-1);
> +      if (len > max_len)
> +        len = max_len;
> +      max_len -= len;
> +
> +      for (; len > 0; ++data, --len)
> +          if (*data < 0)
> +            return data;
> +    }
> +
> +#endif
> +
> +  /* Scan the input one machine word at a time. */
> +  for (; max_len > sizeof(apr_uintptr_t)
> +       ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t))
> +    if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET)
> +      break;
> +
> +  /* The remaining odd bytes will be examined the naive way: */
> +  for (; max_len > 0; ++data, --max_len)
> +    if (*data < 0)
> +      return data;
> +
> +  return data;
> +}
> +
> +/* Scan the C string in *DATA for non-ASCII chars. Return the position
> + * of either the first non-ASCII char or the terminating NUL.
> + */
> +static const char *
> +first_non_ascii_char_cstring(const char *data)
> +{
> +  /* We need to make sure that BUF is properly aligned for chunky data
> +   * access because we don't know the string's length. Unaligned chunk
> +   * read access beyond the NUL terminator could therefore result in a
> +   * segfault.
> +   */
> +  for (; (apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1); ++data)
> +    if (*data <= 0)
> +      return data;
> +
> +  /* Scan the input one machine word at a time. */
> +  for (; ; data += sizeof(apr_uintptr_t))
> +    {
> +      /* Check for non-ASCII chars: */
> +      apr_uintptr_t chunk = *(const apr_uintptr_t *)data;
> +      if (chunk & SVN__BIT_7_SET)
> +        break;
> +
> +      /* This is the well-known strlen test: */
> +      chunk |= (chunk & SVN__LOWER_7BITS_SET) + SVN__LOWER_7BITS_SET;
> +      if ((chunk & SVN__BIT_7_SET) != SVN__BIT_7_SET)
> +        break;
> +    }
> +
> +  /* The remaining odd bytes will be examined the naive way: */
> +  for (; ; ++data)
> +    if (*data <= 0)
> +      return data;
> +
> +  return data;


I get a compiler warning here (VS 2010):

c:\research\svn\client_build\trunk2\subversion\libsvn_subr\utf_validate.c(328):
warning C4702: unreachable code

It seems that the last "return data;" is superfluous, since the loop above
it can only exit by returning?

-- 
Johan

Re: svn commit: r1381766 - /subversion/trunk/subversion/libsvn_subr/utf_validate.c

Reply via email to