This simplifies the interface for other UTF-8 validity checks when a
simple "yes" or "no" answer is sufficient.

Signed-off-by: Ben Boeckel <ben.boec...@kitware.com>
---
 libcpp/ChangeLog  |  6 ++++++
 libcpp/charset.cc | 18 ++++++++++++++++++
 libcpp/internal.h |  2 ++
 3 files changed, 26 insertions(+)

diff --git a/libcpp/ChangeLog b/libcpp/ChangeLog
index 4d707277531..4e2c7900ae2 100644
--- a/libcpp/ChangeLog
+++ b/libcpp/ChangeLog
@@ -1,3 +1,9 @@
+2022-10-27  Ben Boeckel  <ben.boec...@kitware.com>
+
+       * charset.cc (_cpp_valid_utf8_str): New function that determines
+       whether a C string is valid UTF-8 or not.
+       * internal.h (_cpp_valid_utf8_str): Add prototype.
+
 2022-10-27  Ben Boeckel  <ben.boec...@kitware.com>
 
        * include/charset.cc: Reject encodings of codepoints above 0x10FFFF.
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index e9da6674b5f..0524ab6beba 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -1864,6 +1864,24 @@ _cpp_valid_utf8 (cpp_reader *pfile,
   return true;
 }
 
+/* Return true if the NUL-terminated string NAME is valid UTF-8, false
+   at the first sequence one_utf8_to_cppchar rejects (nonzero return).  */
+
+bool
+_cpp_valid_utf8_str (const char *name)
+{
+  const uchar *in = (const uchar *) name;
+  size_t len = strlen (name);
+  cppchar_t cp;
+
+  /* Decode until the terminating NUL; any nonzero result is a failure.  */
+  while (*in)
+    if (one_utf8_to_cppchar (&in, &len, &cp))
+      return false;
+
+  return true;
+}
+
 /* Subroutine of convert_hex and convert_oct.  N is the representation
    in the execution character set of a numeric escape; write it into the
    string buffer TBUF and update the end-of-string pointer therein.  WIDE
diff --git a/libcpp/internal.h b/libcpp/internal.h
index badfd1b40da..4f2dd4a2f5c 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -834,6 +834,8 @@ extern bool _cpp_valid_utf8 (cpp_reader *pfile,
                             struct normalize_state *nst,
                             cppchar_t *cp);
 
+extern bool _cpp_valid_utf8_str (const char *str);
+
 extern void _cpp_destroy_iconv (cpp_reader *);
 extern unsigned char *_cpp_convert_input (cpp_reader *, const char *,
                                          unsigned char *, size_t, size_t,
-- 
2.37.3

Reply via email to