Hello, I'm katahiromz. Thank you for your great software.
I want to add UTF-16/UTF-32 support to your C preprocessor.
This patch to GCC adds automatic character encoding detection
to `libcpp/files.cc` by examining the first 4 bytes of each input file.
I hope this patch helps.
---
Technical information:
**Detection logic (`detect_encoding`, called from `read_file_guts`):**
- Binary files (all zeros in first 4 bytes) --> error
- BOM detection:
  - `FF FE 00 00` --> UTF-32LE
  - `00 00 FE FF` --> UTF-32BE
  - `FF FE` --> UTF-16LE
  - `FE FF` --> UTF-16BE
  - `EF BB BF` --> UTF-8 (handled by existing code)
- Null byte pattern inference (no BOM):
  - bytes[1]==0 && bytes[3]==0 --> UTF-16LE
  - bytes[0]==0 && bytes[2]==0 --> UTF-16BE
  - bytes[1,2,3]==0 --> UTF-32LE
  - bytes[0,1,2]==0 --> UTF-32BE
**Changes:**
- Added `detect_encoding()` function for BOM/pattern detection
- Modified `read_file_guts()` to auto-detect and strip BOM before conversion
Files shorter than 4 bytes skip the binary check and the null byte pattern inference; only the BOM checks that fit in the available bytes are applied.
---
diff --git a/libcpp/files.cc b/libcpp/files.cc
index d80c4bfd907..2a10752691a 100644
--- a/libcpp/files.cc
+++ b/libcpp/files.cc
@@ -710,6 +710,105 @@ _cpp_find_file (cpp_reader *pfile, const char
*fname, cpp_dir *start_dir,
return file;
}
+/* Detect the encoding of the input in BUF (LEN valid bytes) and return
+   its charset name.  *BOM_LEN receives the number of leading BOM bytes
+   the caller must skip before conversion (0 if none).
+
+   Detection order:
+   1) First 4 bytes all zero -> "BINARY" (sentinel, not a charset).
+   2) BOM detection:
+      - 0xFF 0xFE 0x00 0x00 -> "UTF-32LE"
+      - 0x00 0x00 0xFE 0xFF -> "UTF-32BE"
+      - 0xFF 0xFE (not followed by 0x00 0x00) -> "UTF-16LE"
+      - 0xFE 0xFF -> "UTF-16BE"
+      - 0xEF 0xBB 0xBF -> NULL; the BOM is left in place
+   3) Null byte patterns (no BOM, LEN >= 4):
+      - bytes[1..3] zero, bytes[0] nonzero -> "UTF-32LE"
+      - bytes[0..2] zero, bytes[3] nonzero -> "UTF-32BE"
+      - bytes[1]==0 && bytes[3]==0 -> "UTF-16LE"
+      - bytes[0]==0 && bytes[2]==0 -> "UTF-16BE"
+      The UTF-32 patterns are tested first because each of them also
+      matches the corresponding UTF-16 pattern.
+   4) Otherwise NULL: the caller keeps its own input charset.
+
+   Checks that need more bytes than LEN provides are skipped.  */
+
+static const char *
+detect_encoding (const uchar *buf, ssize_t len, size_t *bom_len)
+{
+  *bom_len = 0;
+
+  if (len < 1)
+    return NULL;
+
+  /* Check for binary file (all first 4 bytes are zero).  */
+  if (len >= 4
+      && buf[0] == 0 && buf[1] == 0 && buf[2] == 0 && buf[3] == 0)
+    return "BINARY";
+
+  /* Check for UTF-32 LE BOM: 0xFF 0xFE 0x00 0x00.
+     Must check before UTF-16 LE BOM since it starts with 0xFF 0xFE.  */
+  if (len >= 4
+      && buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00)
+    {
+      *bom_len = 4;
+      return "UTF-32LE";
+    }
+
+  /* Check for UTF-32 BE BOM: 0x00 0x00 0xFE 0xFF.  */
+  if (len >= 4
+      && buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF)
+    {
+      *bom_len = 4;
+      return "UTF-32BE";
+    }
+
+  /* Check for UTF-16 LE BOM: 0xFF 0xFE (not followed by 0x00 0x00).  */
+  if (len >= 2 && buf[0] == 0xFF && buf[1] == 0xFE)
+    {
+      *bom_len = 2;
+      return "UTF-16LE";
+    }
+
+  /* Check for UTF-16 BE BOM: 0xFE 0xFF.  */
+  if (len >= 2 && buf[0] == 0xFE && buf[1] == 0xFF)
+    {
+      *bom_len = 2;
+      return "UTF-16BE";
+    }
+
+  /* UTF-8 BOM: 0xEF 0xBB 0xBF.  It is handled separately in
+     _cpp_convert_input, so neither charset nor buffer is changed here.  */
+  if (len >= 3 && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF)
+    return NULL;
+
+  /* No BOM found.  Try to infer encoding from null byte patterns.
+     Only check if we have at least 4 bytes.  */
+  if (len >= 4)
+    {
+      /* UTF-32 LE: 2nd, 3rd, and 4th bytes are zero.  This must be
+         tested before UTF-16 LE, whose pattern it also satisfies.  */
+      if (buf[1] == 0 && buf[2] == 0 && buf[3] == 0 && buf[0] != 0)
+        return "UTF-32LE";
+
+      /* UTF-32 BE: 1st, 2nd, and 3rd bytes are zero.  This must be
+         tested before UTF-16 BE, whose pattern it also satisfies.  */
+      if (buf[0] == 0 && buf[1] == 0 && buf[2] == 0 && buf[3] != 0)
+        return "UTF-32BE";
+
+      /* UTF-16 LE: 2nd and 4th bytes are zero (ASCII-range characters).  */
+      if (buf[1] == 0 && buf[3] == 0 && (buf[0] != 0 || buf[2] != 0))
+        return "UTF-16LE";
+
+      /* UTF-16 BE: 1st and 3rd bytes are zero (ASCII-range characters).  */
+      if (buf[0] == 0 && buf[2] == 0 && (buf[1] != 0 || buf[3] != 0))
+        return "UTF-16BE";
+    }
+
+  /* No encoding detected, use the provided charset.  */
+  return NULL;
+}
+
/* Read a file into FILE->buffer, returning true on success.
If FILE->fd is something weird, like a block device, we don't want
@@ -795,9 +894,45 @@ read_file_guts (cpp_reader *pfile, _cpp_file
*file, location_t loc,
cpp_error_at (pfile, CPP_DL_WARNING, loc,
"%s is shorter than expected", file->path);
+ /* Auto-detect encoding from the first 4 bytes.  A detected encoding
+ takes precedence over input_charset.  */
+ const char *detected_charset = NULL;
+ size_t bom_len = 0;
+
+ if (total >= 1)
+ {
+ detected_charset = detect_encoding (buf, total, &bom_len);
+
+ /* Check for binary file. */
+ if (detected_charset && strcmp (detected_charset, "BINARY") == 0)
+ {
+ if (pfile)
+ cpp_error_at (pfile, CPP_DL_ERROR, loc,
+ "%s appears to be a binary file", file->path);
+ free (buf);
+ return false;
+ }
+ }
+
+ /* Use detected charset if we found one, otherwise use the provided one. */
+ const char *effective_charset = detected_charset ? detected_charset
+ : input_charset;
+
+ /* If we have a BOM to skip, adjust the buffer. */
+ ssize_t convert_len = total;
+
+ if (bom_len > 0 && (size_t)total >= bom_len)
+ {
+ /* Move data to skip the BOM. We need to adjust the buffer
+ so _cpp_convert_input doesn't see the BOM. */
+ convert_len = total - bom_len;
+ memmove (buf, buf + bom_len, convert_len);
+ }
+
file->buffer = _cpp_convert_input (pfile,
- input_charset,
- buf, size + pad, total,
+ effective_charset,
+ buf, size + pad,
+ convert_len,
&file->buffer_start,
&file->st.st_size);
file->buffer_valid = file->buffer;
---
Best regards,
Katayama Hirofumi MZ <[email protected]>
_______________________________________________
Mingw-w64-public mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public