Hi,
This is a second attempt at the patch. It breaks the format-c-5 test,
because it converts every string to UTF-8. The error is:
"""
xgettext: Non-ASCII string at ../tests/format-c-5-prg.c:60.
Please specify the source encoding through --from-code.
FAIL: format-c-5
"""
because the sixtieth line of gettext-tools/tests/format-c-5-prg.c is
"""
expected_result = "Vater von \xdb\xb5 Kindern";
"""
which is not a valid ASCII string. With this patch, every string is
converted to UTF-8, so this string makes xgettext abort with the error above.
Adding --from-code=UTF-8 to the test is a workaround, although right now I
do not have any real idea of how to solve it properly. :(
Happy Hacking!
Miguel
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog
index f188106..9b3a93a 100644
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,30 @@
+2013-02-17 Miguel Angel Arruga Vivas <[email protected]>
+
+ Add support for Unicode escape sequences in x-c.c, based on the
+ Unicode support in x-java.c.
+ * x-c.c (po-charset.h): Included for 'po_charset_utf8'.
+ (unistr.h): Included for 'ucs4_t'.
+ (comment_line_end): Store comments in UTF-8.
+ (check_unicode_codepoint): New function.
+ (P7_UNICODE4): New macro.
+ (P7_UNICODE8): Likewise.
+ (phase7_getc): Add new case for 'u'/'U'.
+ (extract_unicode_codepoint): New function.
+ (mixed_string_buffer): New struct. Code from x-python.c without
+ UTF-16 surrogates support.
+ (mixed_string_buffer_init): New function. Code from x-python.c
+ without UTF-16 surrogates support.
+ (mixed_string_buffer_free): Likewise.
+ (mixed_string_buffer_append_byte): Likewise.
+ (mixed_string_buffer_append_unicode_grow): Likewise.
+ (mixed_string_buffer_flush): Likewise.
+ (mixed_string_buffer_append_unicode): Likewise.
+ (mixed_string_buffer_append): Likewise.
+ (mixed_string_buffer_result): Likewise.
+ (phase5_get): Store UTF-8 string literals and identifiers.
+ (extract_parenthesized): Change 'xgettext_current_source_encoding'
+ to 'po_charset_utf8'.
+
2013-02-06 Miguel Angel Arruga Vivas <[email protected]>
GtkBuilder support in xgettext.
diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c
index ea0a874..24798b4 100644
--- a/gettext-tools/src/x-c.c
+++ b/gettext-tools/src/x-c.c
@@ -36,6 +36,8 @@
#include "xalloc.h"
#include "xvasprintf.h"
#include "hash.h"
+#include "po-charset.h"
+#include "unistr.h"
#include "gettext.h"
#define _(s) gettext(s)
@@ -711,6 +713,7 @@ comment_add (int c)
static inline void
comment_line_end (size_t chars_to_remove)
{
+ char *utf8_buffer;
buflen -= chars_to_remove;
while (buflen >= 1
&& (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
@@ -721,7 +724,11 @@ comment_line_end (size_t chars_to_remove)
buffer = xrealloc (buffer, bufmax);
}
buffer[buflen] = '\0';
- savable_comment_add (buffer);
+ utf8_buffer = from_current_source_encoding (buffer, lc_comment,
+ logical_file_name, line_number);
+ savable_comment_add (utf8_buffer);
+ if (utf8_buffer != buffer)
+ free (utf8_buffer);
}
@@ -857,6 +864,40 @@ struct token_ty
int line_number;
};
+/* Look ahead in the phase 3 input and tell whether the hexadecimal digits
+   of a universal character name follow: 4 digits for '\u', 8 for '\U'.
+   FOUR_DIGITS: true for the '\u' form, false for the '\U' form.
+   Returns true if all the expected digits are present, false otherwise.
+   In both cases every character read here is pushed back, so the input
+   position is unchanged and extract_unicode_codepoint can read the
+   digits again.  */
+static bool
+check_unicode_codepoint (bool four_digits)
+{
+  int stored[8];
+  int expected = four_digits ? 4 : 8;
+  int count = 0;
+  bool valid = true;
+
+  /* Stop at the first character that is not a hexadecimal digit; it is
+     stored too, so that it gets pushed back below.  */
+  while (valid && count < expected)
+    {
+      int c = phase3_getc ();
+
+      stored[count++] = c;
+      valid = ((c >= '0' && c <= '9')
+               || (c >= 'A' && c <= 'F')
+               || (c >= 'a' && c <= 'f'));
+    }
+
+  /* Push back in reverse order of reading, on success as well as on
+     failure: this function is only a look-ahead.
+     NOTE(review): this relies on phase3_ungetc accepting up to 8
+     pushed-back characters, as the failure path always did -- confirm
+     against the size of the phase 3 pushback buffer.  */
+  while (count > 0)
+    phase3_ungetc (stored[--count]);
+
+  return valid;
+}
/* 7. Replace escape sequences within character strings with their
single character equivalents. This is called from phase 5, because
@@ -867,6 +908,8 @@ struct token_ty
#define P7_QUOTES (1000 + '"')
#define P7_QUOTE (1000 + '\'')
#define P7_NEWLINE (1000 + '\n')
+#define P7_UNICODE4 (1000 + 'u')
+#define P7_UNICODE8 (1000 + 'U')
static int
phase7_getc ()
@@ -998,6 +1041,18 @@ phase7_getc ()
}
phase3_ungetc (c);
return n;
+
+ /* Unicode support. */
+ case 'u':case 'U':
+ if (!check_unicode_codepoint (c == 'u'))
+ {
+ phase3_ungetc (c);
+ return '\\';
+ }
+ else if (c == 'u')
+ return P7_UNICODE4;
+ else
+ return P7_UNICODE8;
}
}
@@ -1020,6 +1075,218 @@ free_token (token_ty *tp)
drop_reference (tp->comment);
}
+/* Unicode support. */
+
+/* Read the hexadecimal digits of a universal character name from the
+   phase 3 input and return the code point they denote.
+   FOUR_DIGITS: true to read 4 digits ('\u'), false to read 8 ('\U').
+   Aborts when a character that is not a hexadecimal digit is read.  */
+static ucs4_t
+extract_unicode_codepoint (bool four_digits)
+{
+  int remaining = four_digits ? 4 : 8;
+  ucs4_t value = 0;
+
+  while (remaining-- > 0)
+    {
+      int c = phase3_getc ();
+
+      if (c >= '0' && c <= '9')
+        value = value * 16 + c - '0';
+      else if (c >= 'A' && c <= 'F')
+        value = value * 16 + 10 + c - 'A';
+      else if (c >= 'a' && c <= 'f')
+        value = value * 16 + 10 + c - 'a';
+      else
+        /* This should be called pointing to a valid unicode
+           escaped sequence.  */
+        abort ();
+    }
+
+  return value;
+}
+
+/* A string buffer type that allows appending bytes (in the
+ xgettext_current_source_encoding) or Unicode characters.
+ mixed_string_buffer_result returns the entire string in UTF-8 encoding. */
+struct mixed_string_buffer
+{
+ /* Bytes appended since the last flush, still in the source encoding:
+ BUFMAX is the allocated size of BUFFER, BUFLEN the number of bytes
+ in use. */
+ char *buffer;
+ size_t bufmax;
+ size_t buflen;
+ /* The part of the string already converted to UTF-8: UTF8_BUFMAX is
+ the allocated size of UTF8_BUFFER, UTF8_BUFLEN the number of bytes
+ in use. */
+ char *utf8_buffer;
+ size_t utf8_bufmax;
+ size_t utf8_buflen;
+ /* The lexical context, passed to from_current_source_encoding when
+ flushing. Used only for error message purposes. */
+ lexical_context_ty lcontext;
+};
+
+/* Put *BP into the empty state: nothing allocated and nothing stored in
+   either the source-encoding part or the UTF-8 part.
+   LCONTEXT is remembered for use when the buffer is flushed.  */
+static void
+mixed_string_buffer_init (struct mixed_string_buffer *bp,
+                          lexical_context_ty lcontext)
+{
+  bp->lcontext = lcontext;
+
+  /* Source-encoding part.  */
+  bp->buffer = NULL;
+  bp->buflen = bp->bufmax = 0;
+
+  /* UTF-8 part.  */
+  bp->utf8_buffer = NULL;
+  bp->utf8_buflen = bp->utf8_bufmax = 0;
+}
+
+/* Release the memory held by *BP and bring it back to the empty state.
+   Any pointer previously obtained from mixed_string_buffer_result becomes
+   invalid.  */
+static void
+mixed_string_buffer_free (struct mixed_string_buffer *bp)
+{
+  /* free (NULL) is a no-op, so no guards are needed.  */
+  free (bp->buffer);
+  free (bp->utf8_buffer);
+
+  /* Reset the sizes together with the pointers: a stale non-zero bufmax
+     next to a NULL pointer would let a later append skip the allocation
+     and write through NULL.  */
+  bp->buffer = NULL;
+  bp->bufmax = 0;
+  bp->buflen = 0;
+  bp->utf8_buffer = NULL;
+  bp->utf8_bufmax = 0;
+  bp->utf8_buflen = 0;
+}
+
+/* Auxiliary function: Store the byte C at the end of bp->buffer,
+   enlarging the allocation first when it is full.  */
+static void
+mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, int c)
+{
+  if (bp->buflen == bp->bufmax)
+    {
+      size_t grown = 2 * bp->bufmax + 10;
+
+      bp->buffer = xrealloc (bp->buffer, grown);
+      bp->bufmax = grown;
+    }
+
+  bp->buffer[bp->buflen] = c;
+  bp->buflen++;
+}
+
+
+/* Auxiliary function: Ensure COUNT more bytes are available in
+   bp->utf8_buffer.  The allocation grows to at least 2 * old size + 10,
+   or to exactly what is needed when that is larger.  */
+static inline void
+mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp,
+                                         size_t count)
+{
+  size_t needed = bp->utf8_buflen + count;
+  size_t target;
+
+  if (needed <= bp->utf8_bufmax)
+    /* Enough room already.  */
+    return;
+
+  target = 2 * bp->utf8_bufmax + 10;
+  if (target < needed)
+    target = needed;
+
+  bp->utf8_buffer = xrealloc (bp->utf8_buffer, target);
+  bp->utf8_bufmax = target;
+}
+
+/* Auxiliary function: Flush bp->buffer into bp->utf8_buffer.
+   The pending bytes are converted with from_current_source_encoding and
+   the result is appended to the UTF-8 part; bp->buffer is empty afterwards.
+   LINENO is handed to the conversion together with logical_file_name.
+   Does nothing when no bytes are pending.  */
+static inline void
+mixed_string_buffer_flush (struct mixed_string_buffer *bp,
+                           int lineno)
+{
+  char *converted;
+  size_t converted_len;
+
+  if (bp->buflen == 0)
+    return;
+
+  /* NUL-terminate the pending bytes so they form a C string.  */
+  mixed_string_buffer_append_byte (bp, '\0');
+
+  /* Convert from the source encoding to UTF-8.  */
+  converted = from_current_source_encoding (bp->buffer, bp->lcontext,
+                                            logical_file_name, lineno);
+  converted_len = strlen (converted);
+
+  /* Append it to bp->utf8_buffer.  */
+  mixed_string_buffer_append_unicode_grow (bp, converted_len);
+  memcpy (bp->utf8_buffer + bp->utf8_buflen, converted, converted_len);
+  bp->utf8_buflen += converted_len;
+
+  /* The conversion may hand back its argument; free only a fresh copy.  */
+  if (converted != bp->buffer)
+    free (converted);
+
+  bp->buflen = 0;
+}
+
+/* Auxiliary function: Append the Unicode character UC, encoded in UTF-8,
+   to bp->utf8_buffer.  UC must be < 0x110000; the function aborts when
+   u8_uctomb rejects it.  */
+static void
+mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, ucs4_t uc)
+{
+  unsigned char encoded[6];
+  int nbytes;
+
+  nbytes = u8_uctomb (encoded, uc, 6);
+  if (nbytes < 0)
+    /* The caller should have ensured that uc is not out-of-range.  */
+    abort ();
+
+  mixed_string_buffer_append_unicode_grow (bp, nbytes);
+  memcpy (bp->utf8_buffer + bp->utf8_buflen, encoded, nbytes);
+  bp->utf8_buflen += nbytes;
+}
+
+/* Append a character or Unicode character to a 'struct mixed_string_buffer'.
+   C is either a single byte in the source encoding, or one of the markers
+   P7_UNICODE4 / P7_UNICODE8.  For a marker, the hexadecimal digits of the
+   universal character name are read from the input with
+   extract_unicode_codepoint.  A code point that is rejected gets a warning
+   and is replaced with U+FFFD.  */
+static void
+mixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
+{
+  if (c == P7_UNICODE4 || c == P7_UNICODE8)
+    {
+      /* Append a Unicode character.  */
+      ucs4_t uc = extract_unicode_codepoint (c == P7_UNICODE4);
+
+      /* Switch from multibyte character mode to Unicode character mode.  */
+      mixed_string_buffer_flush (bp, line_number);
+
+      /* Reject values beyond U+10FFFF, the control ranges U+0001..U+001F
+         and U+007F..U+009F, and the surrogates U+D800..U+DFFF.  */
+      if (uc >= 0x110000
+          || (uc > 0 && uc < 0x20)
+          || (uc >= 0x7f && uc < 0xa0)
+          || (uc >= 0xd800 && uc < 0xe000))
+        {
+          /* GCC will not compile this source file.
+             Any reason to actually extract this string?  */
+          error_with_progname = false;
+          /* No newline has been consumed on this path, so the escape is on
+             the current line: report line_number, like the flush above.  */
+          error (0, 0, _("%s:%d: warning: invalid universal character"),
+                 logical_file_name, line_number);
+          error_with_progname = true;
+          uc = 0xfffd;
+        }
+
+      mixed_string_buffer_append_unicode (bp, uc);
+    }
+  else
+    {
+      /* Append a single byte.  */
+
+      /* When a newline is seen, convert the accumulated multibyte sequence.
+         This ensures a correct line number in the error message in case of
+         a conversion error.  The "- 1" is to account for the newline.  */
+      if (c == '\n')
+        mixed_string_buffer_flush (bp, line_number - 1);
+
+      mixed_string_buffer_append_byte (bp, c);
+    }
+}
+
+/* Return the string buffer's contents as a NUL-terminated UTF-8 string.
+   The returned pointer is bp->utf8_buffer itself, not a copy: it is
+   released by mixed_string_buffer_free.  */
+static char *
+mixed_string_buffer_result (struct mixed_string_buffer *bp)
+{
+  char *result;
+
+  /* Convert whatever is still pending in the source encoding.  */
+  mixed_string_buffer_flush (bp, line_number);
+
+  /* Make room for the terminator.  The buffer may move here, so the
+     pointer is read only afterwards.  */
+  mixed_string_buffer_append_unicode_grow (bp, 1);
+  result = bp->utf8_buffer;
+  result[bp->utf8_buflen] = '\0';
+
+  return result;
+}
+
/* 5. Parse each resulting logical line as preprocessing tokens and
white space. Preprocessing tokens and C tokens don't always match. */
@@ -1033,6 +1300,7 @@ phase5_get (token_ty *tp)
{
static char *buffer;
static int bufmax;
+ struct mixed_string_buffer msbuffer;
int bufpos;
int c;
@@ -1086,15 +1354,11 @@ phase5_get (token_ty *tp)
case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
case 'v': case 'w': case 'x': case 'y': case 'z':
- bufpos = 0;
+ /* FIXME: \uXXXX or \UXXXXXXXX are valid in the identifier. */
+ mixed_string_buffer_init (&msbuffer, lc_outside);
for (;;)
{
- if (bufpos >= bufmax)
- {
- bufmax = 2 * bufmax + 10;
- buffer = xrealloc (buffer, bufmax);
- }
- buffer[bufpos++] = c;
+ mixed_string_buffer_append (&msbuffer, c);
c = phase4_getc ();
switch (c)
{
@@ -1119,14 +1383,9 @@ phase5_get (token_ty *tp)
}
break;
}
- if (bufpos >= bufmax)
- {
- bufmax = 2 * bufmax + 10;
- buffer = xrealloc (buffer, bufmax);
- }
- buffer[bufpos] = 0;
- tp->string = xstrdup (buffer);
+ tp->string = xstrdup (mixed_string_buffer_result (&msbuffer));
tp->type = token_type_name;
+ mixed_string_buffer_free (&msbuffer);
return;
case '.':
@@ -1237,7 +1496,7 @@ phase5_get (token_ty *tp)
but since gettext's argument is not a wide character string,
let the compiler complain about the argument not matching the
prototype. Just pretend it won't happen. */
- bufpos = 0;
+ mixed_string_buffer_init (&msbuffer, lc_string);
for (;;)
{
c = phase7_getc ();
@@ -1254,22 +1513,12 @@ phase5_get (token_ty *tp)
break;
if (c == P7_QUOTE)
c = '\'';
- if (bufpos >= bufmax)
- {
- bufmax = 2 * bufmax + 10;
- buffer = xrealloc (buffer, bufmax);
- }
- buffer[bufpos++] = c;
- }
- if (bufpos >= bufmax)
- {
- bufmax = 2 * bufmax + 10;
- buffer = xrealloc (buffer, bufmax);
+ mixed_string_buffer_append (&msbuffer, c);
}
- buffer[bufpos] = 0;
tp->type = token_type_string_literal;
- tp->string = xstrdup (buffer);
+ tp->string = xstrdup (mixed_string_buffer_result (&msbuffer));
tp->comment = add_reference (savable_comment);
+ mixed_string_buffer_free (&msbuffer);
return;
case '(':
@@ -1843,7 +2092,10 @@ extract_parenthesized (message_list_ty *mlp,
arglist_parser_alloc (mlp,
state ? next_shapes : NULL)))
{
+ xgettext_current_source_encoding = po_charset_utf8;
arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding =
+ xgettext_global_source_encoding;
return true;
}
next_context_iter = null_context_list_iterator;
@@ -1852,7 +2104,9 @@ extract_parenthesized (message_list_ty *mlp,
continue;
case xgettext_token_type_rparen:
+ xgettext_current_source_encoding = po_charset_utf8;
arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
return false;
case xgettext_token_type_comma:
@@ -1886,6 +2140,7 @@ extract_parenthesized (message_list_ty *mlp,
continue;
case xgettext_token_type_string_literal:
+ xgettext_current_source_encoding = po_charset_utf8;
if (extract_all)
remember_a_message (mlp, NULL, token.string, inner_context,
&token.pos, NULL, token.comment);
@@ -1894,6 +2149,7 @@ extract_parenthesized (message_list_ty *mlp,
inner_context,
token.pos.file_name, token.pos.line_number,
token.comment);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
drop_reference (token.comment);
next_context_iter = null_context_list_iterator;
selectorcall_context_iter = null_context_list_iterator;
@@ -1907,7 +2163,9 @@ extract_parenthesized (message_list_ty *mlp,
continue;
case xgettext_token_type_eof:
+ xgettext_current_source_encoding = po_charset_utf8;
arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
return true;
default: