[Mingw-w64-public] Add UTF-16 support to C preprocessor

katahiromz Tue, 02 Dec 2025 15:32:30 -0800

Hello, I'm katahiromz. Thank you for your great software.
I want to add UTF-16/UTF-32 support to your C preprocessor.


This patch to GCC adds automatic character-encoding detection
to `libcpp/files.cc` by examining the first 4 bytes of each input file.
I hope this patch helps.

---
Technical information:

**Detection logic in `read_file_guts`:**

- Binary files (all zeros in first 4 bytes) --> error
- BOM detection:
  - `FF FE 00 00` --> UTF-32LE
  - `00 00 FE FF` --> UTF-32BE
  - `FF FE` --> UTF-16LE
  - `FE FF` --> UTF-16BE
  - `EF BB BF` --> UTF-8 (handled by existing code)
- Null byte pattern inference (no BOM):
  - bytes[1]==0 && bytes[3]==0 --> UTF-16LE
  - bytes[0]==0 && bytes[2]==0 --> UTF-16BE
  - bytes[1,2,3]==0 --> UTF-32LE
  - bytes[0,1,2]==0 --> UTF-32BE

**Changes:**
- Added `detect_encoding()` function for BOM/pattern detection
- Modified `read_file_guts()` to auto-detect and strip BOM before conversion

Files shorter than 4 bytes are processed normally: only the BOM checks that
fit are applied, and no null-byte pattern inference is performed.
---
diff --git a/libcpp/files.cc b/libcpp/files.cc
index d80c4bfd907..2a10752691a 100644
--- a/libcpp/files.cc
+++ b/libcpp/files.cc
@@ -710,6 +710,105 @@ _cpp_find_file (cpp_reader *pfile, const char
*fname, cpp_dir *start_dir,
   return file;
 }

+/* Detect the encoding of an input file from its first bytes.  Return the
+   charset name to convert from, or NULL if nothing was detected, in which
+   case the caller should use the charset it was given.  *BOM_LEN is set to
+   the number of leading BOM bytes the caller must skip (0 if none).
+
+   Detection order:
+   1) First 4 bytes all zero -> "BINARY".  This is a sentinel, not a
+      charset name; the caller is expected to reject the file.
+   2) BOM detection:
+      - 0xFF 0xFE 0x00 0x00 -> "UTF-32LE", BOM_LEN 4
+      - 0x00 0x00 0xFE 0xFF -> "UTF-32BE", BOM_LEN 4
+      - 0xFF 0xFE (not followed by 0x00 0x00) -> "UTF-16LE", BOM_LEN 2
+      - 0xFE 0xFF -> "UTF-16BE", BOM_LEN 2
+      - 0xEF 0xBB 0xBF -> NULL, BOM_LEN 0 (left for _cpp_convert_input)
+   3) Null byte pattern detection (no BOM, at least 4 bytes).  UTF-32 is
+      tested first: every UTF-32 pattern also satisfies a UTF-16 test.
+      - buf[0]!=0 && buf[1]==0 && buf[2]==0 && buf[3]==0 -> "UTF-32LE"
+      - buf[0]==0 && buf[1]==0 && buf[2]==0 && buf[3]!=0 -> "UTF-32BE"
+      - buf[1]==0 && buf[3]==0 -> "UTF-16LE"
+      - buf[0]==0 && buf[2]==0 -> "UTF-16BE"
+   4) Otherwise, return NULL.
+
+   With fewer than 4 bytes only the BOM checks that fit are performed.  */
+
+static const char *
+detect_encoding (const uchar *buf, ssize_t len, size_t *bom_len)
+{
+  *bom_len = 0;
+
+  if (len < 1)
+    return NULL;
+
+  /* A file that starts with four zero bytes is not text in any of the
+     encodings handled here; report it as binary.  */
+  if (len >= 4
+      && buf[0] == 0 && buf[1] == 0 && buf[2] == 0 && buf[3] == 0)
+    return "BINARY";
+
+  /* UTF-32 LE BOM: 0xFF 0xFE 0x00 0x00.  Test it before the UTF-16 LE BOM,
+     which is a prefix of it.  */
+  if (len >= 4
+      && buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00)
+    {
+      *bom_len = 4;
+      return "UTF-32LE";
+    }
+
+  /* UTF-32 BE BOM: 0x00 0x00 0xFE 0xFF.  */
+  if (len >= 4
+      && buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF)
+    {
+      *bom_len = 4;
+      return "UTF-32BE";
+    }
+
+  /* UTF-16 LE BOM: 0xFF 0xFE (not followed by 0x00 0x00).  */
+  if (len >= 2 && buf[0] == 0xFF && buf[1] == 0xFE)
+    {
+      *bom_len = 2;
+      return "UTF-16LE";
+    }
+
+  /* UTF-16 BE BOM: 0xFE 0xFF.  */
+  if (len >= 2 && buf[0] == 0xFE && buf[1] == 0xFF)
+    {
+      *bom_len = 2;
+      return "UTF-16BE";
+    }
+
+  /* UTF-8 BOM: 0xEF 0xBB 0xBF.  It is handled by _cpp_convert_input, so
+     neither override the charset nor strip the BOM here.  */
+  if (len >= 3 && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF)
+    return NULL;
+
+  /* No BOM found.  Try to infer the encoding from null byte patterns, which
+     needs at least 4 bytes.  */
+  if (len >= 4)
+    {
+      /* The UTF-32 tests must come first: "xx 00 00 00" also has a zero
+         2nd and 4th byte and "00 00 00 xx" a zero 1st and 3rd byte, so the
+         UTF-16 tests below would otherwise claim every such file.  */
+      if (buf[0] != 0 && buf[1] == 0 && buf[2] == 0 && buf[3] == 0)
+        return "UTF-32LE";
+      if (buf[0] == 0 && buf[1] == 0 && buf[2] == 0 && buf[3] != 0)
+        return "UTF-32BE";
+
+      /* UTF-16 LE: 2nd and 4th bytes are zero (ASCII-range characters).  */
+      if (buf[1] == 0 && buf[3] == 0 && (buf[0] != 0 || buf[2] != 0))
+        return "UTF-16LE";
+
+      /* UTF-16 BE: 1st and 3rd bytes are zero (ASCII-range characters).  */
+      if (buf[0] == 0 && buf[2] == 0 && (buf[1] != 0 || buf[3] != 0))
+        return "UTF-16BE";
+    }
+
+  /* No encoding detected, use the provided charset.  */
+  return NULL;
+}
+
 /* Read a file into FILE->buffer, returning true on success.

    If FILE->fd is something weird, like a block device, we don't want
@@ -795,9 +894,45 @@ read_file_guts (cpp_reader *pfile, _cpp_file
*file, location_t loc,
     cpp_error_at (pfile, CPP_DL_WARNING, loc,
    "%s is shorter than expected", file->path);

+  /* Auto-detect encoding from first 4 bytes if input_charset is not
+     explicitly specified.  */
+  const char *detected_charset = NULL;
+  size_t bom_len = 0;
+
+  if (total >= 1)
+    {
+      detected_charset = detect_encoding (buf, total, &bom_len);
+
+      /* Check for binary file.  */
+      if (detected_charset && strcmp (detected_charset, "BINARY") == 0)
+ {
+   if (pfile)
+     cpp_error_at (pfile, CPP_DL_ERROR, loc,
+   "%s appears to be a binary file", file->path);
+   free (buf);
+   return false;
+ }
+    }
+
+  /* Use detected charset if we found one, otherwise use the provided one.  */
+  const char *effective_charset = detected_charset ? detected_charset
+    : input_charset;
+
+  /* If we have a BOM to skip, adjust the buffer.  */
+  ssize_t convert_len = total;
+
+  if (bom_len > 0 && (size_t)total >= bom_len)
+    {
+      /* Move data to skip the BOM.  We need to adjust the buffer
+ so _cpp_convert_input doesn't see the BOM.  */
+      convert_len = total - bom_len;
+      memmove (buf, buf + bom_len, convert_len);
+    }
+
   file->buffer = _cpp_convert_input (pfile,
-      input_charset,
-      buf, size + pad, total,
+      effective_charset,
+      buf, size + pad,
+      convert_len,
       &file->buffer_start,
       &file->st.st_size);
   file->buffer_valid = file->buffer;
---
Best regards,
Katayama Hirofumi MZ <[email protected]>


_______________________________________________
Mingw-w64-public mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public

[Mingw-w64-public] Add UTF-16 support to C preprocessor

Reply via email to