On Wed, Mar 13, 2019 at 08:16:05PM +0200, Lauri Tirkkonen wrote:
> Changing worddelimiters to wchar_t * allows to use wcschr() instead of
> implementing utf8strchr().
> 
> Also, since there are so many different spaces and punctuation in
> unicode, I have another diff that uses iswspace()/iswpunct() to figure
> out delimiters, but I expect that to be a bit more controversial than
> this patch so I'll send it separately.
> 
> From cb65b11944147f3ea43b9b19abf068e0dea7a562 Mon Sep 17 00:00:00 2001
> From: Lauri Tirkkonen <[email protected]>
> Date: Wed, 13 Mar 2019 19:40:52 +0200
> Subject: [PATCH] replace utf8strchr with wcschr
> 
> ---
>  config.def.h |  4 ++--
>  st.c         | 20 +-------------------
>  st.h         |  2 +-
>  3 files changed, 4 insertions(+), 22 deletions(-)
> 
> diff --git a/config.def.h b/config.def.h
> index 0e01717..482901e 100644
> --- a/config.def.h
> +++ b/config.def.h
> @@ -30,9 +30,9 @@ static float chscale = 1.0;
>  /*
>   * word delimiter string
>   *
> - * More advanced example: " `'\"()[]{}"
> + * More advanced example: L" `'\"()[]{}"
>   */
> -char *worddelimiters = " ";
> +wchar_t *worddelimiters = L" ";
>  
>  /* selection timeouts (in milliseconds) */
>  static unsigned int doubleclicktimeout = 300;
> diff --git a/st.c b/st.c
> index d35f89d..812f30c 100644
> --- a/st.c
> +++ b/st.c
> @@ -41,7 +41,7 @@
>  #define ISCONTROLC0(c)               (BETWEEN(c, 0, 0x1f) || (c) == '\177')
>  #define ISCONTROLC1(c)               (BETWEEN(c, 0x80, 0x9f))
>  #define ISCONTROL(c)         (ISCONTROLC0(c) || ISCONTROLC1(c))
> -#define ISDELIM(u)           (utf8strchr(worddelimiters, u) != NULL)
> +#define ISDELIM(u)           (u != 0 && wcschr(worddelimiters, u) != NULL)
>  
>  enum term_mode {
>       MODE_WRAP        = 1 << 0,
> @@ -210,7 +210,6 @@ static void selsnap(int *, int *, int);
>  static size_t utf8decode(const char *, Rune *, size_t);
>  static Rune utf8decodebyte(char, size_t *);
>  static char utf8encodebyte(Rune, size_t);
> -static char *utf8strchr(char *, Rune);
>  static size_t utf8validate(Rune *, size_t);
>  
>  static char *base64dec(const char *);
> @@ -337,23 +336,6 @@ utf8encodebyte(Rune u, size_t i)
>       return utfbyte[i] | (u & ~utfmask[i]);
>  }
>  
> -char *
> -utf8strchr(char *s, Rune u)
> -{
> -     Rune r;
> -     size_t i, j, len;
> -
> -     len = strlen(s);
> -     for (i = 0, j = 0; i < len; i += j) {
> -             if (!(j = utf8decode(&s[i], &r, len - i)))
> -                     break;
> -             if (r == u)
> -                     return &(s[i]);
> -     }
> -
> -     return NULL;
> -}
> -
>  size_t
>  utf8validate(Rune *u, size_t i)
>  {
> diff --git a/st.h b/st.h
> index 38c61c4..4da3051 100644
> --- a/st.h
> +++ b/st.h
> @@ -114,7 +114,7 @@ char *xstrdup(char *);
>  extern char *utmp;
>  extern char *stty_args;
>  extern char *vtiden;
> -extern char *worddelimiters;
> +extern wchar_t *worddelimiters;
>  extern int allowaltscreen;
>  extern char *termname;
>  extern unsigned int tabspaces;
> -- 
> 2.20.1
> 
> -- 
> Lauri Tirkkonen | lotheac @ IRCnet
> 

I don't like mixing the existing functions with wchar_t.
I think st should (at the very least internally) use UTF-8.

Won't apply.

-- 
Kind regards,
Hiltjo

Reply via email to