Re: [PATCH v5 02/12] ctype: support iscntrl, ispunct, isxdigit and isprint
On Wed, Oct 17, 2012 at 7:09 PM, "Jan H. Schönherr" wrote: >> const unsigned char sane_ctype[256] = { >> - 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0, /* 0.. 15 */ >> - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16.. 31 */ >> + X, X, X, X, X, X, X, X, X, Z, Z, X, X, Z, X, X, /* 0.. 15 */ >> + X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 16.. 31 */ > > "Normal" isspace() also includes vertical tab (11) and form-feed (12) as > white-space characters. Is there a reason, why they are not included here? I'm not sure. They were not classified as spaces in the very first version in 4546738 (Unlocalized isspace and friends - 2005-10-13). Maybe Linus had a reason to do so. >> +#define isprint(x) (sane_istest(x, GIT_ALPHA | GIT_DIGIT | GIT_SPACE | \ >> + GIT_PUNCT | GIT_REGEX_SPECIAL | GIT_GLOB_SPECIAL | \ >> + GIT_PATHSPEC_MAGIC)) > > "Normal" isprint() only includes space (32) from the white-space characters. > The other white-space characters are not considered printable. > > Do we want to stay close to the "original", or not? We do. I followed [1] but obviously missed the last sentence in the "print" description: "No characters specified for the keyword cntrl shall be specified". Thanks for catching. I'll fix it soon. [1] http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html -- Duy -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 02/12] ctype: support iscntrl, ispunct, isxdigit and isprint
Hi Nguyen. I just had a need for isprint() myself, and then I found your code here. I had a look at the POSIX locale as described here: http://sourceware.org/git/?p=glibc.git;a=blob;f=localedata/locales/POSIX Some remarks below. Am 14.10.2012 16:26, schrieb Nguyen Thai Ngoc Duy: > -- 8< -- > diff --git a/ctype.c b/ctype.c > index faeaf34..0bfebb4 100644 > --- a/ctype.c > +++ b/ctype.c > @@ -11,18 +11,21 @@ enum { > D = GIT_DIGIT, > G = GIT_GLOB_SPECIAL, /* *, ?, [, \\ */ > R = GIT_REGEX_SPECIAL, /* $, (, ), +, ., ^, {, | */ > - P = GIT_PATHSPEC_MAGIC /* other non-alnum, except for ] and } */ > + P = GIT_PATHSPEC_MAGIC, /* other non-alnum, except for ] and } */ > + X = GIT_CNTRL, > + U = GIT_PUNCT, > + Z = GIT_CNTRL | GIT_SPACE > }; > > const unsigned char sane_ctype[256] = { > - 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0, /* 0.. 15 */ > - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16.. 31 */ > + X, X, X, X, X, X, X, X, X, Z, Z, X, X, Z, X, X, /* 0.. 15 */ > + X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 16.. 31 */ "Normal" isspace() also includes vertical tab (11) and form-feed (12) as white-space characters. Is there a reason, why they are not included here? > S, P, P, P, R, P, P, P, R, R, G, R, P, P, R, P, /* 32.. 47 */ > D, D, D, D, D, D, D, D, D, D, P, P, P, P, P, G, /* 48.. 63 */ > P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, /* 64.. 79 */ > - A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, P, /* 80.. 95 */ > + A, A, A, A, A, A, A, A, A, A, A, G, G, U, R, P, /* 80.. 95 */ > P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, /* 96..111 */ > - A, A, A, A, A, A, A, A, A, A, A, R, R, 0, P, 0, /* 112..127 */ > + A, A, A, A, A, A, A, A, A, A, A, R, R, U, P, X, /* 112..127 */ > /* Nothing in the 128.. range */ > }; > > diff --git a/git-compat-util.h b/git-compat-util.h > index f8b859c..db77f3e 100644 > --- a/git-compat-util.h > +++ b/git-compat-util.h [...] 
> @@ -527,6 +533,13 @@ extern const unsigned char sane_ctype[256]; > #define isupper(x) sane_iscase(x, 0) > #define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL) > #define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | > GIT_REGEX_SPECIAL) > +#define iscntrl(x) (sane_istest(x,GIT_CNTRL)) > +#define ispunct(x) sane_istest(x, GIT_PUNCT | GIT_REGEX_SPECIAL | \ > + GIT_GLOB_SPECIAL | GIT_PATHSPEC_MAGIC) > +#define isxdigit(x) (hexval_table[x] != -1) > +#define isprint(x) (sane_istest(x, GIT_ALPHA | GIT_DIGIT | GIT_SPACE | \ > + GIT_PUNCT | GIT_REGEX_SPECIAL | GIT_GLOB_SPECIAL | \ > + GIT_PATHSPEC_MAGIC)) "Normal" isprint() only includes space (32) from the white-space characters. The other white-space characters are not considered printable. Do we want to stay close to the "original", or not? Regards Jan -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 02/12] ctype: support iscntrl, ispunct, isxdigit and isprint
On Sun, Oct 14, 2012 at 03:59:31PM +0200, René Scharfe wrote: > Am 14.10.2012 15:25, schrieb Nguyen Thai Ngoc Duy: > > On Sun, Oct 14, 2012 at 7:59 PM, René Scharfe > > wrote: > >> With that, couldn't you squeeze the other two classes into the existing > >> sane_type? > > > > No there are still conflicts: 9, 10 and 13 as spaces (vs controls) and > > 123, 124 and 126 as regex/pathspec special (vs punctuation). > > That's not a problem, an entry in the table can have more than one bit > set -- just OR them together in ctype.c. It may not look as nice, but > that's OK. You could also define a character for GIT_SPACE | GIT_CNTRL > etc. for cosmetic reasons. Only space chars are not a subset of control chars, which needs a new combination. So the result does not look as bad as I thought: -- 8< -- diff --git a/ctype.c b/ctype.c index faeaf34..0bfebb4 100644 --- a/ctype.c +++ b/ctype.c @@ -11,18 +11,21 @@ enum { D = GIT_DIGIT, G = GIT_GLOB_SPECIAL, /* *, ?, [, \\ */ R = GIT_REGEX_SPECIAL, /* $, (, ), +, ., ^, {, | */ - P = GIT_PATHSPEC_MAGIC /* other non-alnum, except for ] and } */ + P = GIT_PATHSPEC_MAGIC, /* other non-alnum, except for ] and } */ + X = GIT_CNTRL, + U = GIT_PUNCT, + Z = GIT_CNTRL | GIT_SPACE }; const unsigned char sane_ctype[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0, /* 0.. 15 */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16.. 31 */ + X, X, X, X, X, X, X, X, X, Z, Z, X, X, Z, X, X, /* 0.. 15 */ + X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 16.. 31 */ S, P, P, P, R, P, P, P, R, R, G, R, P, P, R, P, /* 32.. 47 */ D, D, D, D, D, D, D, D, D, D, P, P, P, P, P, G, /* 48.. 63 */ P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, /* 64.. 79 */ - A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, P, /* 80.. 95 */ + A, A, A, A, A, A, A, A, A, A, A, G, G, U, R, P, /* 80.. 
95 */ P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, /* 96..111 */ - A, A, A, A, A, A, A, A, A, A, A, R, R, 0, P, 0, /* 112..127 */ + A, A, A, A, A, A, A, A, A, A, A, R, R, U, P, X, /* 112..127 */ /* Nothing in the 128.. range */ }; diff --git a/git-compat-util.h b/git-compat-util.h index f8b859c..db77f3e 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -510,6 +510,10 @@ extern const char tolower_trans_tbl[256]; #undef isupper #undef tolower #undef toupper +#undef iscntrl +#undef ispunct +#undef isxdigit +#undef isprint extern const unsigned char sane_ctype[256]; #define GIT_SPACE 0x01 #define GIT_DIGIT 0x02 @@ -517,6 +521,8 @@ extern const unsigned char sane_ctype[256]; #define GIT_GLOB_SPECIAL 0x08 #define GIT_REGEX_SPECIAL 0x10 #define GIT_PATHSPEC_MAGIC 0x20 +#define GIT_CNTRL 0x40 +#define GIT_PUNCT 0x80 #define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0) #define isascii(x) (((x) & ~0x7f) == 0) #define isspace(x) sane_istest(x,GIT_SPACE) @@ -527,6 +533,13 @@ extern const unsigned char sane_ctype[256]; #define isupper(x) sane_iscase(x, 0) #define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL) #define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL) +#define iscntrl(x) (sane_istest(x,GIT_CNTRL)) +#define ispunct(x) sane_istest(x, GIT_PUNCT | GIT_REGEX_SPECIAL | \ + GIT_GLOB_SPECIAL | GIT_PATHSPEC_MAGIC) +#define isxdigit(x) (hexval_table[x] != -1) +#define isprint(x) (sane_istest(x, GIT_ALPHA | GIT_DIGIT | GIT_SPACE | \ + GIT_PUNCT | GIT_REGEX_SPECIAL | GIT_GLOB_SPECIAL | \ + GIT_PATHSPEC_MAGIC)) #define tolower(x) sane_case((unsigned char)(x), 0x20) #define toupper(x) sane_case((unsigned char)(x), 0) #define is_pathspec_magic(x) sane_istest(x,GIT_PATHSPEC_MAGIC) -- 8< -- -- Duy -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 02/12] ctype: support iscntrl, ispunct, isxdigit and isprint
Am 14.10.2012 15:25, schrieb Nguyen Thai Ngoc Duy: On Sun, Oct 14, 2012 at 7:59 PM, René Scharfe wrote: With that, couldn't you squeeze the other two classes into the existing sane_type? No there are still conflicts: 9, 10 and 13 as spaces (vs controls) and 123, 124 and 126 as regex/pathspec special (vs punctuation). That's not a problem, an entry in the table can have more than one bit set -- just OR them together in ctype.c. It may not look as nice, but that's OK. You could also define a character for GIT_SPACE | GIT_CNTRL etc. for cosmetic reasons. René -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 02/12] ctype: support iscntrl, ispunct, isxdigit and isprint
On Sun, Oct 14, 2012 at 7:59 PM, René Scharfe wrote: >> +const unsigned char sane_ctype2[256] = { >> + CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, /* >> 0..15 */ >> + CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, /* >> 16..31 */ >> + 0, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, /* >> 32..47 */ >> + XD, XD, XD, XD, XD, XD, XD, XD, XD, XD, PU, PU, PU, PU, PU, PU, /* >> 48..63 */ >> + PU, 0, XD, 0, XD, 0, XD, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* >> 64..79 */ >> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PU, PU, PU, PU, PU, /* >> 80..95 */ >> + PU, 0, XD, 0, XD, 0, XD, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* >> 96..111 */ >> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PU, PU, PU, PU, CN, /* >> 112..127 */ > > > Shouldn't [ACE] (65, 67, 69) and [ace] (97, 99, 101) be xdigits as well? Hmm.. I generated it from LANG=C. I wonder where I got it wrong.. > But how about using the existing hexval_table instead, like this: > > #define isxdigit(x) (hexval_table[(x)] != -1) > > With that, couldn't you squeeze the other two classes into the existing > sane_type? No there are still conflicts: 9, 10 and 13 as spaces (vs controls) and 123, 124 and 126 as regex/pathspec special (vs punctuation). > By the way, I'm working on a patch series for implementing a lot more > character classes with table lookups. It grew out of a desire to make > bad_ref_char() faster but perhaps got a bit out of hand by now; it's at 24 > patches and still not finished. I'm curious how long we have until it > escapes. ;-) I don't think the series is going to graduate any time soon :) -- Duy -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 02/12] ctype: support iscntrl, ispunct, isxdigit and isprint
Am 14.10.2012 04:35, schrieb Nguyễn Thái Ngọc Duy: Signed-off-by: Nguyễn Thái Ngọc Duy --- ctype.c | 18 ++ git-compat-util.h | 13 + 2 files changed, 31 insertions(+) diff --git a/ctype.c b/ctype.c index faeaf34..b4bf48a 100644 --- a/ctype.c +++ b/ctype.c @@ -26,6 +26,24 @@ const unsigned char sane_ctype[256] = { /* Nothing in the 128.. range */ }; +enum { + CN = GIT_CNTRL, + PU = GIT_PUNCT, + XD = GIT_XDIGIT, +}; + +const unsigned char sane_ctype2[256] = { + CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, /* 0..15 */ + CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, /* 16..31 */ + 0, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, /* 32..47 */ + XD, XD, XD, XD, XD, XD, XD, XD, XD, XD, PU, PU, PU, PU, PU, PU, /* 48..63 */ + PU, 0, XD, 0, XD, 0, XD, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64..79 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PU, PU, PU, PU, PU, /* 80..95 */ + PU, 0, XD, 0, XD, 0, XD, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96..111 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PU, PU, PU, PU, CN, /* 112..127 */ Shouldn't [ACE] (65, 67, 69) and [ace] (97, 99, 101) be xdigits as well? But how about using the existing hexval_table instead, like this: #define isxdigit(x) (hexval_table[(x)] != -1) With that, couldn't you squeeze the other two classes into the existing sane_type? By the way, I'm working on a patch series for implementing a lot more character classes with table lookups. It grew out of a desire to make bad_ref_char() faster but perhaps got a bit out of hand by now; it's at 24 patches and still not finished. I'm curious how long we have until it escapes. 
;-) #define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL) +#define iscntrl(x) sane_istest2(x, GIT_CNTRL) +#define ispunct(x) sane_istest2(x, GIT_PUNCT) +#define isxdigit(x) sane_istest2(x, GIT_XDIGIT) +#define isprint(x) (isalnum(x) || isspace(x) || ispunct(x)) If a single table is used, you can do with a single table lookup by adding the bits for the component classes, like isalnum and is_regex_special do. René -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 02/12] ctype: support iscntrl, ispunct, isxdigit and isprint
On Sun, Oct 14, 2012 at 12:02 PM, Junio C Hamano wrote: > Nguyễn Thái Ngọc Duy writes: > >> Signed-off-by: Nguyễn Thái Ngọc Duy >> --- > > The description to justify why it is ctype2[] seems to have been > lost. Intended? Nope. I added the description after generating the patches and forgot to make the same update on my branch. Thanks for catching. -- Duy -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 02/12] ctype: support iscntrl, ispunct, isxdigit and isprint
Nguyễn Thái Ngọc Duy writes: > Signed-off-by: Nguyễn Thái Ngọc Duy > --- The description to justify why it is ctype2[] seems to have been lost. Intended? > ctype.c | 18 ++ > git-compat-util.h | 13 + > 2 files changed, 31 insertions(+) > > diff --git a/ctype.c b/ctype.c > index faeaf34..b4bf48a 100644 > --- a/ctype.c > +++ b/ctype.c > @@ -26,6 +26,24 @@ const unsigned char sane_ctype[256] = { > /* Nothing in the 128.. range */ > }; > > +enum { > + CN = GIT_CNTRL, > + PU = GIT_PUNCT, > + XD = GIT_XDIGIT, > +}; > + > +const unsigned char sane_ctype2[256] = { > + CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, /* > 0..15 */ > + CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, CN, /* > 16..31 */ > + 0, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, PU, /* > 32..47 */ > + XD, XD, XD, XD, XD, XD, XD, XD, XD, XD, PU, PU, PU, PU, PU, PU, /* > 48..63 */ > + PU, 0, XD, 0, XD, 0, XD, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* > 64..79 */ > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PU, PU, PU, PU, PU, /* > 80..95 */ > + PU, 0, XD, 0, XD, 0, XD, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* > 96..111 */ > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PU, PU, PU, PU, CN, /* > 112..127 */ > + /* Nothing in the 128.. 
range */ > +}; > + > /* For case-insensitive kwset */ > const char tolower_trans_tbl[256] = { > 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, > diff --git a/git-compat-util.h b/git-compat-util.h > index f8b859c..ea11694 100644 > --- a/git-compat-util.h > +++ b/git-compat-util.h > @@ -510,14 +510,23 @@ extern const char tolower_trans_tbl[256]; > #undef isupper > #undef tolower > #undef toupper > +#undef iscntrl > +#undef ispunct > +#undef isxdigit > +#undef isprint > extern const unsigned char sane_ctype[256]; > +extern const unsigned char sane_ctype2[256]; > #define GIT_SPACE 0x01 > #define GIT_DIGIT 0x02 > #define GIT_ALPHA 0x04 > #define GIT_GLOB_SPECIAL 0x08 > #define GIT_REGEX_SPECIAL 0x10 > #define GIT_PATHSPEC_MAGIC 0x20 > +#define GIT_CNTRL 0x01 > +#define GIT_PUNCT 0x02 > +#define GIT_XDIGIT 0x04 > #define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0) > +#define sane_istest2(x,mask) ((sane_ctype2[(unsigned char)(x)] & (mask)) != > 0) > #define isascii(x) (((x) & ~0x7f) == 0) > #define isspace(x) sane_istest(x,GIT_SPACE) > #define isdigit(x) sane_istest(x,GIT_DIGIT) > @@ -527,6 +536,10 @@ extern const unsigned char sane_ctype[256]; > #define isupper(x) sane_iscase(x, 0) > #define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL) > #define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | > GIT_REGEX_SPECIAL) > +#define iscntrl(x) sane_istest2(x, GIT_CNTRL) > +#define ispunct(x) sane_istest2(x, GIT_PUNCT) > +#define isxdigit(x) sane_istest2(x, GIT_XDIGIT) > +#define isprint(x) (isalnum(x) || isspace(x) || ispunct(x)) > #define tolower(x) sane_case((unsigned char)(x), 0x20) > #define toupper(x) sane_case((unsigned char)(x), 0) > #define is_pathspec_magic(x) sane_istest(x,GIT_PATHSPEC_MAGIC) -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html