[bug-gettext] [RFC Patch] Implement \u support in xgettext for C family (C11/C++11)

Miguel Ángel Fri, 08 Feb 2013 16:09:50 -0800

Hello,

I have implemented very basic support for escaped Unicode code points.
I am not sure whether I always have to change
'xgettext_current_source_encoding'; I have looked at the x-java.c code. I
still have to extend the testsuite, because so far I have only tested it
with simple files and my current make check (with GtkBuilder support).


This is the second item in the 'plans' file that I am trying to remove. ;-)

Happy hacking!
Miguel

diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog
index f188106..e579ce8 100644
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,20 @@
+2013-02-08  Miguel Angel Arruga Vivas  <[email protected]>
+
+	Add support for Unicode escape sequences in x-c.c, based on
+	the Unicode support in x-java.c.
+	* x-c.c (po-charset.h): Included file.
+	(unistr.h): Likewise.
+	(P7_UNICODE): New macro.
+	(phase7_getc): Add new case for 'u'/'U'.
+	(skip_unicode_codepoint): New function.
+	(get_unicode_codepoint): New function.
+	(utf8_string_to_store): New boolean.
+	(phase5_get): Skip unicode codepoint in '' strings.
+	Store UTF-8 representation in "" strings.
+	(extract_parenthesized): Change 'xgettext_current_source_encoding'
+	to 'po_charset_utf8' when needed. From x-java.c code.
+	(extract_whole_file): Set 'utf8_string_to_store'.
+
 2013-02-06  Miguel Angel Arruga Vivas  <[email protected]>
 
 	GtkBuilder support in xgettext.
diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c
index ea0a874..12d2f41 100644
--- a/gettext-tools/src/x-c.c
+++ b/gettext-tools/src/x-c.c
@@ -36,6 +36,8 @@
 #include "xalloc.h"
 #include "xvasprintf.h"
 #include "hash.h"
+#include "po-charset.h"
+#include "unistr.h"
 #include "gettext.h"
 
 #define _(s) gettext(s)
@@ -867,6 +869,7 @@ struct token_ty
 #define P7_QUOTES (1000 + '"')
 #define P7_QUOTE (1000 + '\'')
 #define P7_NEWLINE (1000 + '\n')
+#define P7_UNICODE (1000 + 'u')
 
 static int
 phase7_getc ()
@@ -998,6 +1001,26 @@ phase7_getc ()
         }
       phase3_ungetc (c);
       return n;
+
+    /* Unicode support. We keep the u/U in c. */
+    case 'u': case 'U':
+      n = phase3_getc (); /* n stores the next character.  */
+      switch (n)
+	{
+        default:
+	  j = '\\'; /* j stores the result.  */
+	  break;
+
+        case '0': case '1': case '2': case '3': case '4':
+        case '5': case '6': case '7': case '8': case '9':
+        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+	  j = P7_UNICODE;
+          break;
+	}
+      phase3_ungetc (n);
+      phase3_ungetc (c);
+      return j;
     }
 }
 
@@ -1021,12 +1044,107 @@ free_token (token_ty *tp)
 }
 
 
+/* Discard a \uXXXX or \UXXXXXXXX escape.  Must be called right after
+   phase7_getc returned P7_UNICODE, so that the next phase3 character is the
+   'u' or 'U' marker.  A truncated escape ends at the first non-hex character,
+   which is pushed back instead of aborting on malformed input.  */
+static void
+skip_unicode_codepoint ()
+{
+  int num_digits, j;
+  int c = phase3_getc ();
+
+  switch (c)
+    {
+    case 'u':
+      num_digits = 4;
+      break;
+    case 'U':
+      num_digits = 8;
+      break;
+    default:
+      /* Only reachable if the calling contract above was violated.  */
+      abort ();
+    }
+
+  for (j = 0; j < num_digits; ++j)
+    {
+      c = phase3_getc ();
+      switch (c)
+	{
+	case '0': case '1': case '2': case '3': case '4':
+	case '5': case '6': case '7': case '8': case '9':
+	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+	  break;
+	default:
+	  /* Too few hex digits: give C back to the reader and stop.  */
+	  phase3_ungetc (c);
+	  return;
+	}
+    }
+}
+
+/* Decode a \uXXXX or \UXXXXXXXX escape and return its code point.  Must be
+   called right after phase7_getc returned P7_UNICODE, so that the next phase3
+   character is the 'u' or 'U' marker.  A truncated escape ends at the first
+   non-hex character, which is pushed back; the digits read so far are used.
+   Surrogates and values above 0x10FFFF are replaced by U+FFFD, because the
+   caller hands the result to u8_uctomb and uses the returned length without
+   checking it for failure.  */
+static ucs4_t
+get_unicode_codepoint ()
+{
+  int num_digits, j;
+  ucs4_t n = 0;
+  int c = phase3_getc ();
+
+  switch (c)
+    {
+    case 'u':
+      num_digits = 4;
+      break;
+    case 'U':
+      num_digits = 8;
+      break;
+    default:
+      /* Only reachable if the calling contract above was violated.  */
+      abort ();
+    }
+
+  for (j = 0; j < num_digits; ++j)
+    {
+      c = phase3_getc ();
+      if (c >= '0' && c <= '9')
+	n = n * 16 + (c - '0');
+      else if (c >= 'A' && c <= 'F')
+	n = n * 16 + 10 + (c - 'A');
+      else if (c >= 'a' && c <= 'f')
+	n = n * 16 + 10 + (c - 'a');
+      else
+	{
+	  /* Too few hex digits.  Malformed input must not abort xgettext:
+	     give C back to the reader and keep what was decoded.  */
+	  phase3_ungetc (c);
+	  break;
+	}
+    }
+
+  /* Only Unicode scalar values can be encoded as UTF-8; substitute the
+     replacement character for anything else.  */
+  if (n > 0x10FFFF || (n >= 0xD800 && n <= 0xDFFF))
+    n = 0xFFFD;
+  return n;
+}
+
 /* 5. Parse each resulting logical line as preprocessing tokens and
    white space.  Preprocessing tokens and C tokens don't always match.  */
 
 static token_ty phase5_pushback[1];
 static int phase5_pushback_length;
 
+/* Set when a \u is found with a non-ASCII character.  */
+static bool utf8_string_to_store;
 
 static void
 phase5_get (token_ty *tp)
@@ -1228,6 +1346,8 @@ phase5_get (token_ty *tp)
             }
           if (c == EOF || c == P7_QUOTE)
             break;
+	  if (c == P7_UNICODE)
+	    skip_unicode_codepoint ();
         }
       tp->type = token_type_character_constant;
       return;
@@ -1254,12 +1374,30 @@ phase5_get (token_ty *tp)
             break;
           if (c == P7_QUOTE)
             c = '\'';
-          if (bufpos >= bufmax)
-            {
-              bufmax = 2 * bufmax + 10;
-              buffer = xrealloc (buffer, bufmax);
-            }
-          buffer[bufpos++] = c;
+	  if (c == P7_UNICODE)
+	    {
+	      unsigned char utf8buf[6];
+	      int count = u8_uctomb (utf8buf, get_unicode_codepoint (), 6);
+
+	      if (bufpos + count >= bufmax)
+		{
+		  bufmax = 2 * bufmax + 10;
+		  buffer = xrealloc (buffer, bufmax);
+		}
+	      memcpy (buffer + bufpos, utf8buf, count);
+	      if (count > 1)
+		utf8_string_to_store = true;
+	      bufpos += count;
+	    }
+	  else
+	    {
+	      if (bufpos >= bufmax)
+		{
+		  bufmax = 2 * bufmax + 10;
+		  buffer = xrealloc (buffer, bufmax);
+		}
+	      buffer[bufpos++] = c;
+	    }
         }
       if (bufpos >= bufmax)
         {
@@ -1843,7 +1981,15 @@ extract_parenthesized (message_list_ty *mlp,
                                      arglist_parser_alloc (mlp,
                                                            state ? next_shapes : NULL)))
             {
+              if (utf8_string_to_store)
+                xgettext_current_source_encoding = po_charset_utf8;
               arglist_parser_done (argparser, arg);
+              if (utf8_string_to_store)
+                {
+                  utf8_string_to_store = false;
+                  xgettext_current_source_encoding =
+                    xgettext_global_source_encoding;
+                }
               return true;
             }
           next_context_iter = null_context_list_iterator;
@@ -1852,7 +1998,15 @@ extract_parenthesized (message_list_ty *mlp,
           continue;
 
         case xgettext_token_type_rparen:
+          if (utf8_string_to_store)
+            xgettext_current_source_encoding = po_charset_utf8;
           arglist_parser_done (argparser, arg);
+          if (utf8_string_to_store)
+            {
+              utf8_string_to_store = false;
+              xgettext_current_source_encoding =
+                xgettext_global_source_encoding;
+            }
           return false;
 
         case xgettext_token_type_comma:
@@ -1886,6 +2040,8 @@ extract_parenthesized (message_list_ty *mlp,
           continue;
 
         case xgettext_token_type_string_literal:
+          if (utf8_string_to_store)
+            xgettext_current_source_encoding = po_charset_utf8;
           if (extract_all)
             remember_a_message (mlp, NULL, token.string, inner_context,
                                 &token.pos, NULL, token.comment);
@@ -1894,6 +2050,12 @@ extract_parenthesized (message_list_ty *mlp,
                                      inner_context,
                                      token.pos.file_name, token.pos.line_number,
                                      token.comment);
+          if (utf8_string_to_store)
+            {
+              utf8_string_to_store = false;
+              xgettext_current_source_encoding =
+                xgettext_global_source_encoding;
+            }
           drop_reference (token.comment);
           next_context_iter = null_context_list_iterator;
           selectorcall_context_iter = null_context_list_iterator;
@@ -1907,7 +2069,15 @@ extract_parenthesized (message_list_ty *mlp,
           continue;
 
         case xgettext_token_type_eof:
+          if (utf8_string_to_store)
+            xgettext_current_source_encoding = po_charset_utf8;
           arglist_parser_done (argparser, arg);
+          if (utf8_string_to_store)
+            {
+              utf8_string_to_store = false;
+              xgettext_current_source_encoding =
+                xgettext_global_source_encoding;
+            }
           return true;
 
         default:
@@ -1929,6 +2099,7 @@ extract_whole_file (FILE *f,
   real_file_name = real_filename;
   logical_file_name = xstrdup (logical_filename);
   line_number = 1;
+  utf8_string_to_store = false;
 
   newline_count = 0;
   last_comment_line = -1;

[bug-gettext] [RFC Patch] Implement \u support in xgettext for C family (C11/C++11)

Reply via email to