Patch to support extended characters in C/C++ identifiers

Lewis Hyatt Mon, 12 Aug 2019 15:02:04 -0700

Hello-

The attached patch for libcpp adds support for extended characters (e.g. UTF-8)
in identifiers. A preliminary version of the patch was posted on PR c/67224 as
Comment 26 (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67224#c26) and
discussed with Joseph Myers. Here is an updated patch incorporating all
feedback received so far. I hope it is suitable now; please let me know if I
can do anything else to make it ready for you to apply. I am happy to work on
it further, whatever is needed. I can't easily test on anything other than
x86_64-linux, though. I did bootstrap all languages and run all tests on that
platform, and everything passed.


The (relatively short) changes to libcpp are included inline here. I attached
the test cases as a gzipped patch to avoid any problems with the encoding (the
test cases contain some invalid UTF-8 and also other encodings such as Latin-1
as part of the testing).

Thanks for taking a look at it!

-Lewis

libcpp/ChangeLog

        PR c/67224
        * charset.c (_cpp_valid_utf8): New function to help lex UTF-8 tokens.
        * internal.h (_cpp_valid_utf8): Declare.
        * lex.c (forms_identifier_p): Use it to recognize UTF-8 identifiers.
        (_cpp_lex_direct): Handle UTF-8 in identifiers and CPP_OTHER tokens.
        Do all work in "default" case to avoid slowing down typical code paths.
        Also handle $ and UCN in the default case for consistency.

gcc/testsuite/ChangeLog

        PR c/67224
        * c-c++-common/cpp/ucnid-2011-1-utf8.c: New test.
        * g++.dg/cpp/ucnid-1-utf8.C: New test.
        * g++.dg/cpp/ucnid-2-utf8.C: New test.
        * g++.dg/cpp/ucnid-3-utf8.C: New test.
        * g++.dg/cpp/ucnid-4-utf8.C: New test.
        * g++.dg/other/ucnid-1-utf8.C: New test.
        * gcc.dg/cpp/ucnid-1-utf8.c: New test.
        * gcc.dg/cpp/ucnid-10-utf8.c: New test.
        * gcc.dg/cpp/ucnid-11-utf8.c: New test.
        * gcc.dg/cpp/ucnid-12-utf8.c: New test.
        * gcc.dg/cpp/ucnid-13-utf8.c: New test.
        * gcc.dg/cpp/ucnid-14-utf8.c: New test.
        * gcc.dg/cpp/ucnid-15-utf8.c: New test.
        * gcc.dg/cpp/ucnid-2-utf8.c: New test.
        * gcc.dg/cpp/ucnid-3-utf8.c: New test.
        * gcc.dg/cpp/ucnid-4-utf8.c: New test.
        * gcc.dg/cpp/ucnid-6-utf8.c: New test.
        * gcc.dg/cpp/ucnid-7-utf8.c: New test.
        * gcc.dg/cpp/ucnid-9-utf8.c: New test.
        * gcc.dg/ucnid-1-utf8.c: New test.
        * gcc.dg/ucnid-10-utf8.c: New test.
        * gcc.dg/ucnid-11-utf8.c: New test.
        * gcc.dg/ucnid-12-utf8.c: New test.
        * gcc.dg/ucnid-13-utf8.c: New test.
        * gcc.dg/ucnid-14-utf8.c: New test.
        * gcc.dg/ucnid-15-utf8.c: New test.
        * gcc.dg/ucnid-16-utf8.c: New test.
        * gcc.dg/ucnid-2-utf8.c: New test.
        * gcc.dg/ucnid-3-utf8.c: New test.
        * gcc.dg/ucnid-4-utf8.c: New test.
        * gcc.dg/ucnid-5-utf8.c: New test.
        * gcc.dg/ucnid-6-utf8.c: New test.
        * gcc.dg/ucnid-7-utf8.c: New test.
        * gcc.dg/ucnid-8-utf8.c: New test.
        * gcc.dg/ucnid-9-utf8.c: New test.

diff --git a/libcpp/charset.c b/libcpp/charset.c
index 8a0e5cbb29b..4f1bee96cee 100644
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -1198,6 +1198,84 @@ convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
   return from;
 }
 
+/* Performs a similar task to _cpp_valid_ucn, but parses UTF-8-encoded
+    extended characters rather than UCNs.  If the return value is TRUE, then a
+    character was successfully decoded and stored in *CP; *PSTR has been
+    updated to point one past the valid UTF-8 sequence.  Diagnostics may have
+    been emitted if the character parsed is not allowed in the current context.
+    If the return value is FALSE, then *PSTR has not been modified and *CP may
+    equal 0, to indicate that *PSTR does not form a valid UTF-8 sequence, or it
+    may, when processing an identifier in C mode, equal a codepoint that was
+    validly encoded but is not allowed to appear in an identifier.  In either
+    case, no diagnostic is emitted, and the return value of FALSE should cause
+    a new token to be formed.
+
+    Unlike _cpp_valid_ucn, this will never be called when lexing a string; only
+    a potential identifier, or a CPP_OTHER token.  NST is unused in the latter
+    case.
+
+    As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for
+    the start of an identifier, or 2 otherwise.  */
+
+extern bool
+_cpp_valid_utf8 (cpp_reader *pfile,
+		 const uchar **pstr,
+		 const uchar *limit,
+		 int identifier_pos,
+		 struct normalize_state *nst,
+		 cppchar_t *cp)
+{
+  const uchar *base = *pstr;
+  size_t inbytesleft = limit - base;
+  if (one_utf8_to_cppchar (pstr, &inbytesleft, cp))
+    {
+      /* Not a valid UTF-8 sequence.  No diagnostic here, as this byte
+	 will rather become a new token.  */
+      *cp = 0;
+      return false;
+    }
+
+  if (identifier_pos)
+    {
+      switch (ucn_valid_in_identifier (pfile, *cp, nst))
+	{
+
+	case 0:
+	  /* Not valid in an identifier.  In C++, this is an error because
+	     logically, the UTF-8 was converted to a UCN during translation
+	     phase 1 (even though we don't physically do it that way).  In C,
+	     this byte rather becomes grammatically a separate token, so
+	     rewind *PSTR and let the caller start a new token.  */
+
+	  if (CPP_OPTION (pfile, cplusplus))
+	    cpp_error (pfile, CPP_DL_ERROR,
+		       "extended character %.*s is not valid in an identifier",
+		       (int) (*pstr - base), base);
+	  else
+	    {
+	      *pstr = base;
+	      return false;
+	    }
+
+	  break;
+
+	case 2:
+	  if (identifier_pos == 1)
+	    {
+	      /* Valid in an identifier, but not as its first character.
+		 This is treated the same way in C++ or C99 -- lexed as an
+		 identifier which is then invalid.  */
+	      cpp_error (pfile, CPP_DL_ERROR,
+	  "extended character %.*s is not valid at the start of an identifier",
+			 (int) (*pstr - base), base);
+	    }
+	  break;
+	}
+    }
+
+  return true;
+}
+
 /* Subroutine of convert_hex and convert_oct.  N is the representation
    in the execution character set of a numeric escape; write it into the
    string buffer TBUF and update the end-of-string pointer therein.  WIDE
@@ -1956,8 +2034,9 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
 }
 
 /* Convert an identifier denoted by ID and LEN, which might contain
-   UCN escapes, to the source character set, either UTF-8 or
-   UTF-EBCDIC.  Assumes that the identifier is actually a valid identifier.  */
+   UCN escapes or UTF-8 multibyte chars, to the source character set,
+   either UTF-8 or UTF-EBCDIC.  Assumes that the identifier is actually
+   a valid identifier.  */
 cpp_hashnode *
 _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
 {
diff --git a/libcpp/internal.h b/libcpp/internal.h
index 45167a9500e..d2158426b1f 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -777,6 +777,14 @@ extern bool _cpp_valid_ucn (cpp_reader *, const unsigned char **,
 			    cppchar_t *,
 			    source_range *char_range,
 			    cpp_string_location_reader *loc_reader);
+
+extern bool _cpp_valid_utf8 (cpp_reader *pfile,
+			     const uchar **pstr,
+			     const uchar *limit,
+			     int identifier_pos,
+			     struct normalize_state *nst,
+			     cppchar_t *cp);
+
 extern void _cpp_destroy_iconv (cpp_reader *);
 extern unsigned char *_cpp_convert_input (cpp_reader *, const char *,
 					  unsigned char *, size_t, size_t,
diff --git a/libcpp/lex.c b/libcpp/lex.c
index 16ded6e9b05..15b10cb3f01 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -1313,7 +1313,9 @@ warn_about_normalization (cpp_reader *pfile,
     }
 }
 
-/* Returns TRUE if the sequence starting at buffer->cur is invalid in
+static const cppchar_t utf8_signifier = 0xC0;
+
+/* Returns TRUE if the sequence starting at buffer->cur is valid in
    an identifier.  FIRST is TRUE if this starts an identifier.  */
 static bool
 forms_identifier_p (cpp_reader *pfile, int first,
@@ -1336,17 +1338,25 @@ forms_identifier_p (cpp_reader *pfile, int first,
       return true;
     }
 
-  /* Is this a syntactically valid UCN?  */
-  if (CPP_OPTION (pfile, extended_identifiers)
-      && *buffer->cur == '\\'
-      && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
+  /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
+  if (CPP_OPTION (pfile, extended_identifiers))
     {
       cppchar_t s;
-      buffer->cur += 2;
-      if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
-			  state, &s, NULL, NULL))
-	return true;
-      buffer->cur -= 2;
+      if (*buffer->cur >= utf8_signifier)
+	{
+	  if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
+			       state, &s))
+	    return true;
+	}
+      else if (*buffer->cur == '\\'
+	       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
+	{
+	  buffer->cur += 2;
+	  if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
+			      state, &s, NULL, NULL))
+	    return true;
+	  buffer->cur -= 2;
+	}
     }
 
   return false;
@@ -1464,7 +1474,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
   pfile->buffer->cur = cur;
   if (starts_ucn || forms_identifier_p (pfile, false, nst))
     {
-      /* Slower version for identifiers containing UCNs (or $).  */
+      /* Slower version for identifiers containing UCNs
+	 or extended chars (including $).  */
       do {
 	while (ISIDNUM (*pfile->buffer->cur))
 	  {
@@ -3117,12 +3128,12 @@ _cpp_lex_direct (cpp_reader *pfile)
       /* @ is a punctuator in Objective-C.  */
     case '@': result->type = CPP_ATSIGN; break;
 
-    case '$':
-    case '\\':
+    default:
       {
 	const uchar *base = --buffer->cur;
-	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
 
+	/* Check for an extended identifier ($ or UCN or UTF-8).  */
+	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
 	if (forms_identifier_p (pfile, true, &nst))
 	  {
 	    result->type = CPP_NAME;
@@ -3131,13 +3142,21 @@ _cpp_lex_direct (cpp_reader *pfile)
 	    warn_about_normalization (pfile, result, &nst);
 	    break;
 	  }
+
+	/* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
+	   single token.  */
 	buffer->cur++;
+	if (c >= utf8_signifier)
+	  {
+	    const uchar *pstr = base;
+	    cppchar_t s;
+	    if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
+	      buffer->cur = pstr;
+	  }
+	create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
+	break;
       }
-      /* FALLTHRU */
 
-    default:
-      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
-      break;
     }
 
   /* Potentially convert the location of the token to a range.  */

utf8-identifiers-2.patch.gz
Description: application/gunzip

Patch to support extended characters in C/C++ identifiers

Reply via email to