Fwd: cpp: Add UTF-16/UTF-32 encoding auto-detection in C preprocessor

katahiromz Wed, 26 Nov 2025 17:50:29 -0800

片山博文MZ

---------- Forwarded message ---------
From: katahiromz <[email protected]>
Date: 2025年11月27日(木) 10:39
Subject: Re: cpp: Add UTF-16/UTF-32 encoding auto-detection in C
preprocessor
To: Joseph Myers <[email protected]>

Dear Joseph,

Thank you for your feedback. I understand that UTF-16 and UTF-32 are
not conventional for Unix-like systems, but I'd like to share some
context about GCC usage on Windows.

GCC is widely used on Windows through MinGW, MinGW-w64, MSYS2, and
Cygwin. In Windows development environments, it's common to encounter
source files in mixed encodings (UTF-8 and UTF-16) because:

1. Windows APIs and tools (including Visual Studio) have historically
used UTF-16 as the native Unicode encoding
2. Many legacy Windows projects contain UTF-16 encoded source files
3. Developers working in cross-platform environments often need to
compile code that originated from Windows-centric projects

The lack of UTF-16/UTF-32 support in GCC creates friction for Windows
developers, contributing to Visual Studio maintaining market share in
that ecosystem. Supporting these encodings would make GCC more
competitive for Windows development.

Based on your feedback, I have updated the patch to:

1. Add a new -fauto-detect-utf16-utf32 command-line option that
enables this feature only when explicitly requested (disabled by
default, preserving existing behavior)
2. Update gcc/doc/cppopts.texi with documentation for this new option

This approach ensures backward compatibility while giving Windows
developers the option to enable UTF-16/UTF-32 auto-detection when
needed.

2025年11月27日(木) 9:39 Joseph Myers <[email protected]>:
>
> On Thu, 27 Nov 2025, katahiromz wrote:
>
> > Hello, I'm katahiromz. Thank you for your great software.
> > I want to add UTF-16/UTF-32 support to your C preprocessor.
> >
> > This patch (attached) might add automatic character encoding detection
> > to `libcpp/files.cc` by examining the first 4 bytes of input files.
> > I hope this patch helps.
>
> It's not conventional for Unix-like utilities accepting text files to do
> this sort of thing; text files are expected to be in the locale's
> encoding, or at least in some ASCII-compatible encoding; UTF-16 and UTF-32
> are binary files, and in practice the world utilities such as GCC operate
> in has settled on UTF-8 as the standard Unicode encoding for input and
> output.  Furthermore, the existing logic is documented in cppopts.texi,
> which this patch doesn't change.
>
> --
> Joseph S. Myers
> [email protected]
>

diff --git a/gcc/c-family/c-opts.cc b/gcc/c-family/c-opts.cc
index 8da51759dfd33..aa91b0cb62013 100644
--- a/gcc/c-family/c-opts.cc
+++ b/gcc/c-family/c-opts.cc
@@ -565,6 +565,10 @@ c_common_handle_option (size_t scode, const char *arg, HOST_WIDE_INT value,
       cpp_opts->cpp_input_charset_explicit = 1;
       break;
 
+    case OPT_fauto_detect_utf16_utf32:
+      cpp_opts->auto_detect_utf16_utf32 = value;
+      break;
+
     case OPT_ftemplate_depth_:
       max_tinst_depth = value;
       break;
diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
index e1576c9c3c377..5d455156b430e 100644
--- a/gcc/c-family/c.opt
+++ b/gcc/c-family/c.opt
@@ -1743,6 +1743,10 @@ fassume-sane-operators-new-delete
 C++ ObjC++ Optimization Var(flag_assume_sane_operators_new_delete) Init(1)
 Assume C++ replaceable global operators new, new[], delete, delete[] don't read or write visible global state.
 
+fauto-detect-utf16-utf32
+C ObjC C++ ObjC++ Var(flag_auto_detect_utf16_utf32)
+Enable automatic detection of UTF-16 and UTF-32 encoding in source files.
+
 ; Define extra predefined macros for use in libgcc.
 fbuilding-libgcc
 C ObjC C++ ObjC++ Undocumented Var(flag_building_libgcc)
diff --git a/gcc/doc/cppopts.texi b/gcc/doc/cppopts.texi
index 2df5e9db38fd0..67275ad311c23 100644
--- a/gcc/doc/cppopts.texi
+++ b/gcc/doc/cppopts.texi
@@ -345,6 +345,24 @@ location independent.  This option also affects
 @code{__builtin_FILE()} during compilation.  See also
 @option{-ffile-prefix-map} and @option{-fcanon-prefix-map}.
 
+@opindex fauto-detect-utf16-utf32
+@cindex character set, automatic detection
+@item -fauto-detect-utf16-utf32
+Enable automatic detection of UTF-16 and UTF-32 encoding in source files.
+When this option is enabled, the preprocessor examines the first few bytes
+of each source file to detect the character encoding.  Detection is based on:
+@itemize @bullet
+@item
+Byte Order Mark (BOM): UTF-32LE (@code{FF FE 00 00}), UTF-32BE
+(@code{00 00 FE FF}), UTF-16LE (@code{FF FE}), UTF-16BE (@code{FE FF}).
+@item
+Null byte patterns for files without BOM: alternating null bytes suggest
+UTF-16, while three consecutive null bytes suggest UTF-32.
+@end itemize
+If a BOM is detected, it is automatically stripped from the input.
+This option allows mixing source files with different encodings
+(e.g., including a UTF-8 header from a UTF-16 source file).
+
 @opindex fexec-charset
 @cindex character set, execution
 @item -fexec-charset=@var{charset}
diff --git a/libcpp/files.cc b/libcpp/files.cc
index d80c4bfd90775..bad9fb356acac 100644
--- a/libcpp/files.cc
+++ b/libcpp/files.cc
@@ -710,6 +710,105 @@ _cpp_find_file (cpp_reader *pfile, const char *fname, cpp_dir *start_dir,
   return file;
 }
 
+/* Detect input file encoding from first 4 bytes and return the charset name.
+   Also returns the BOM length to skip in *BOM_LEN.
+
+   Detection logic:
+   1) If first 4 bytes are all zero -> binary file (returns "BINARY")
+   2) BOM detection:
+      - UTF-32 LE BOM: 0xFF 0xFE 0x00 0x00 -> "UTF-32LE"
+      - UTF-32 BE BOM: 0x00 0x00 0xFE 0xFF -> "UTF-32BE"
+      - UTF-16 LE BOM: 0xFF 0xFE (not followed by 0x00 0x00) -> "UTF-16LE"
+      - UTF-16 BE BOM: 0xFE 0xFF -> "UTF-16BE"
+      - UTF-8 BOM: 0xEF 0xBB 0xBF -> NULL (left to _cpp_convert_input)
+   3) Null byte pattern detection (no BOM, needs at least 4 bytes):
+      - bytes[1..3]==0 && bytes[0]!=0 -> "UTF-32LE"
+      - bytes[0..2]==0 && bytes[3]!=0 -> "UTF-32BE"
+      - bytes[1]==0 && bytes[3]==0 -> "UTF-16LE"
+      - bytes[0]==0 && bytes[2]==0 -> "UTF-16BE"
+   4) Otherwise, return NULL (use the provided input_charset).
+
+   If file is less than 4 bytes, only applicable checks are performed.  */
+
+static const char *
+detect_encoding (const uchar *buf, ssize_t len, size_t *bom_len)
+{
+  *bom_len = 0;
+
+  if (len < 1)
+    return NULL;
+
+  /* Check for binary file (all first 4 bytes are zero).  */
+  if (len >= 4
+      && buf[0] == 0 && buf[1] == 0 && buf[2] == 0 && buf[3] == 0)
+    return "BINARY";
+
+  /* Check for UTF-32 LE BOM: 0xFF 0xFE 0x00 0x00
+     Must check before UTF-16 LE BOM since it starts with 0xFF 0xFE.  */
+  if (len >= 4
+      && buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00)
+    {
+      *bom_len = 4;
+      return "UTF-32LE";
+    }
+
+  /* Check for UTF-32 BE BOM: 0x00 0x00 0xFE 0xFF.  */
+  if (len >= 4
+      && buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF)
+    {
+      *bom_len = 4;
+      return "UTF-32BE";
+    }
+
+  /* Check for UTF-16 LE BOM: 0xFF 0xFE (not followed by 0x00 0x00).  */
+  if (len >= 2 && buf[0] == 0xFF && buf[1] == 0xFE)
+    {
+      *bom_len = 2;
+      return "UTF-16LE";
+    }
+
+  /* Check for UTF-16 BE BOM: 0xFE 0xFF.  */
+  if (len >= 2 && buf[0] == 0xFE && buf[1] == 0xFF)
+    {
+      *bom_len = 2;
+      return "UTF-16BE";
+    }
+
+  /* Check for UTF-8 BOM: 0xEF 0xBB 0xBF.
+     Note: UTF-8 BOM is handled separately in _cpp_convert_input,
+     so we don't need to strip it here. Just recognize it.  */
+  if (len >= 3 && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF)
+    {
+      /* UTF-8 BOM is handled by _cpp_convert_input, no need to change charset
+	 or strip BOM here.  */
+      return NULL;
+    }
+
+  /* No BOM found.  Try to infer encoding from null byte patterns (needs at
+     least 4 bytes).  UTF-32 first: its patterns also pass the UTF-16 tests.  */
+  if (len >= 4)
+    {
+      /* UTF-32 LE: 2nd, 3rd, and 4th bytes are zero.  */
+      if (buf[1] == 0 && buf[2] == 0 && buf[3] == 0 && buf[0] != 0)
+	return "UTF-32LE";
+
+      /* UTF-32 BE: 1st, 2nd, and 3rd bytes are zero.  */
+      if (buf[0] == 0 && buf[1] == 0 && buf[2] == 0 && buf[3] != 0)
+	return "UTF-32BE";
+
+      /* UTF-16 LE: 2nd and 4th bytes are zero (for ASCII-range characters).  */
+      if (buf[1] == 0 && buf[3] == 0 && (buf[0] != 0 || buf[2] != 0))
+	return "UTF-16LE";
+
+      /* UTF-16 BE: 1st and 3rd bytes are zero (for ASCII-range characters).  */
+      if (buf[0] == 0 && buf[2] == 0 && (buf[1] != 0 || buf[3] != 0))
+	return "UTF-16BE";
+    }
+
+  /* No encoding detected, use the provided charset.  */
+  return NULL;
+}
+
 /* Read a file into FILE->buffer, returning true on success.
 
    If FILE->fd is something weird, like a block device, we don't want
@@ -795,9 +894,44 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
     cpp_error_at (pfile, CPP_DL_WARNING, loc,
 		  "%s is shorter than expected", file->path);
 
+  /* Auto-detect encoding from first 4 bytes if -fauto-detect-utf16-utf32
+     option is enabled.  */
+  const char *detected_charset = NULL;
+  size_t bom_len = 0;
+
+  if (pfile && CPP_OPTION (pfile, auto_detect_utf16_utf32) && total >= 1)
+    {
+      detected_charset = detect_encoding (buf, total, &bom_len);
+
+      /* Check for binary file.  */
+      if (detected_charset && strcmp (detected_charset, "BINARY") == 0)
+	{
+	  cpp_error_at (pfile, CPP_DL_ERROR, loc,
+			"%s appears to be a binary file", file->path);
+	  free (buf);
+	  return false;
+	}
+    }
+
+  /* Use detected charset if we found one, otherwise use the provided one.  */
+  const char *effective_charset = detected_charset ? detected_charset
+						   : input_charset;
+
+  /* If we have a BOM to skip, adjust the buffer.  */
+  ssize_t convert_len = total;
+
+  if (bom_len > 0 && (size_t)total >= bom_len)
+    {
+      /* Move data to skip the BOM.  We need to adjust the buffer
+	 so _cpp_convert_input doesn't see the BOM.  */
+      convert_len = total - bom_len;
+      memmove (buf, buf + bom_len, convert_len);
+    }
+
   file->buffer = _cpp_convert_input (pfile,
-				     input_charset,
-				     buf, size + pad, total,
+				     effective_charset,
+				     buf, size + pad,
+				     convert_len,
 				     &file->buffer_start,
 				     &file->st.st_size);
   file->buffer_valid = file->buffer;
diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
index 5190ff7d08fec..2ccb8d6eacc00 100644
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -624,6 +624,9 @@ struct cpp_options
   /* True if -finput-charset= option has been used explicitly.  */
   bool cpp_input_charset_explicit;
 
+  /* True if -fauto-detect-utf16-utf32 option is enabled.  */
+  bool auto_detect_utf16_utf32;
+
   /* True if -Wkeyword-macro.  */
   bool cpp_warn_keyword_macro;

Fwd: cpp: Add UTF-16/UTF-32 encoding auto-detection in C preprocessor

Reply via email to