Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package lua-luautf8 for openSUSE:Factory 
checked in at 2022-12-03 10:03:49
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/lua-luautf8 (Old)
 and      /work/SRC/openSUSE:Factory/.lua-luautf8.new.1835 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "lua-luautf8"

Sat Dec  3 10:03:49 2022 rev:3 rq:1039693 version:0.1.5

Changes:
--------
--- /work/SRC/openSUSE:Factory/lua-luautf8/lua-luautf8.changes  2022-11-20 19:47:05.221263146 +0100
+++ /work/SRC/openSUSE:Factory/.lua-luautf8.new.1835/lua-luautf8.changes        2022-12-03 10:04:05.903394285 +0100
@@ -1,0 +2,6 @@
+Sat Dec  3 00:26:27 UTC 2022 - Gordon Leung <[email protected]>
+
+- Update to version 0.1.5:
+ * add clean, isvalid, invalidoffset functions
+
+-------------------------------------------------------------------

Old:
----
  luautf8-0.1.4.tar.xz

New:
----
  luautf8-0.1.5.tar.xz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ lua-luautf8.spec ++++++
--- /var/tmp/diff_new_pack.ALqXWn/_old  2022-12-03 10:04:06.343396730 +0100
+++ /var/tmp/diff_new_pack.ALqXWn/_new  2022-12-03 10:04:06.347396752 +0100
@@ -18,7 +18,7 @@
 
 %define flavor @BUILD_FLAVOR@
 %define mod_name luautf8
-%define rock_version 0.1.4-1
+%define rock_version 0.1.5-1
 %ifarch %{ix86}
  %define luarock_arch x86
 %else
@@ -28,7 +28,7 @@
   %define luarock_arch %{_arch}
  %endif
 %endif
-Version:        0.1.4
+Version:        0.1.5
 Release:        0
 Summary:        A utf-8 support module for Lua and LuaJIT
 License:        MIT

++++++ _service ++++++
--- /var/tmp/diff_new_pack.ALqXWn/_old  2022-12-03 10:04:06.399397041 +0100
+++ /var/tmp/diff_new_pack.ALqXWn/_new  2022-12-03 10:04:06.403397063 +0100
@@ -3,7 +3,7 @@
     <param name="url">https://github.com/starwing/luautf8</param>
     <param name="versionformat">@PARENT_TAG@</param>
     <param name="scm">git</param>
-    <param name="revision">a3db9cca0d7d82d78e2acaba2b5571178fcddc01</param>
+    <param name="revision">751c782864f4c636760339e16f218d6dee292d5d</param>
     <param name="versionrewrite-pattern">(\d+.\d+.\d+)</param>
     <param name="versionrewrite-replacement">\1</param>
   </service>

++++++ luautf8-0.1.4.tar.xz -> luautf8-0.1.5.tar.xz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/luautf8-0.1.4/README.md new/luautf8-0.1.5/README.md
--- old/luautf8-0.1.4/README.md 2022-10-01 16:29:35.000000000 +0200
+++ new/luautf8-0.1.5/README.md 2022-12-01 15:56:51.000000000 +0100
@@ -15,11 +15,12 @@
 It mainly used to compatible with Lua's own string module, it passed all
 string and pattern matching test in lua test suite[2].
 
-It also add some useful routines against UTF-8 features, some like:
+It also adds some useful routines against UTF-8 features, such as:
 - a convenient interface to escape Unicode sequence in string. 
 - string insert/remove, since UTF-8 substring extract may expensive.
 - calculate Unicode width, useful when implement e.g. console emulator.
 - a useful interface to translate Unicode offset and byte offset.
+- checking UTF-8 strings for validity and removing invalid byte sequences.
 
 Note that to avoid conflict with the Lua5.3's buitin library 'utf8',
 this library produce a file like lua-utf8.dll or lua-utf8.so. so use
@@ -162,6 +163,24 @@
 compare a and b without case, -1 means a < b, 0 means a == b and 1 means a > b.
 
 
+### utf8.isvalid(s) -> boolean
+check whether s is a valid UTF-8 string or not.
+
+
+### utf8.clean(s[, replacement_string]) -> cleaned_string, was_valid
+replace any invalid UTF-8 byte sequences in s with the replacement string.
+if no replacement string is provided, the default is "�" (REPLACEMENT CHARACTER U+FFFD).
+note that *any* number of consecutive invalid bytes will be replaced by a single copy of the replacement string.
+the 2nd return value is true if the original string was already valid (meaning no replacements were made).
+
+
+### utf8.invalidoffset(s[, init]) -> offset
+return the byte offset within s of the first invalid UTF-8 byte sequence.
+(1 is the first byte of the string.)
+if s is a valid UTF-8 string, return nil.
+the optional numeric argument init specifies where to start the search; its default value is 1 and can be negative.
+
+
 Improvement needed
 ------------------
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/luautf8-0.1.4/fuzzer/Makefile 
new/luautf8-0.1.5/fuzzer/Makefile
--- old/luautf8-0.1.4/fuzzer/Makefile   1970-01-01 01:00:00.000000000 +0100
+++ new/luautf8-0.1.5/fuzzer/Makefile   2022-12-01 15:56:51.000000000 +0100
@@ -0,0 +1,13 @@
+ALL: lua-utf8.so fuzz-valid fuzz-clean fuzz-invalid
+
+lua-utf8.so: ../lutf8lib.c
+       gcc -g -fPIC $$(pkg-config --cflags lua5.1) ../lutf8lib.c -shared -o 
lua-utf8.so
+
+fuzz-valid: fuzz-valid.c
+       clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 
-llua5.1 fuzz-valid.c -o fuzz-valid
+
+fuzz-clean: fuzz-clean.c
+       clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 
-llua5.1 fuzz-clean.c -o fuzz-clean
+
+fuzz-invalid: fuzz-invalid.c
+       clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 
-llua5.1 fuzz-invalid.c -o fuzz-invalid
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/luautf8-0.1.4/fuzzer/fuzz-clean.c 
new/luautf8-0.1.5/fuzzer/fuzz-clean.c
--- old/luautf8-0.1.4/fuzzer/fuzz-clean.c       1970-01-01 01:00:00.000000000 
+0100
+++ new/luautf8-0.1.5/fuzzer/fuzz-clean.c       2022-12-01 15:56:51.000000000 
+0100
@@ -0,0 +1,161 @@
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+
+#include "lua.h"
+#include "lualib.h"
+#include "lauxlib.h"
+
+lua_State *L;
+
+/* Adapted from mb_utf8_to_wchar (from the PHP codebase) */
+static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len)
+{
+       unsigned char *p = in, *e = p + in_len;
+
+       while (p < e) {
+               unsigned char c = *p++;
+
+               if (c < 0x80) {
+                       /* do nothing */
+               } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */
+                       if (p < e) {
+                               unsigned char c2 = *p++;
+                               if ((c2 & 0xC0) != 0x80) {
+                                       return false;
+                               }
+                       } else {
+                               return false;
+                       }
+               } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */
+                       if ((e - p) >= 2) {
+                               unsigned char c2 = *p++;
+                               unsigned char c3 = *p++;
+                               if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 
0xA0) || (c == 0xED && c2 >= 0xA0)) {
+                                       return false;
+                               } else if ((c3 & 0xC0) != 0x80) {
+                                       return false;
+                               }
+                       } else {
+                               return false;
+                       }
+               } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */
+                       if ((e - p) >= 3) {
+                               unsigned char c2 = *p++;
+                               unsigned char c3 = *p++;
+                               unsigned char c4 = *p++;
+                               /* If c == 0xF0 and c2 < 0x90, then this is an 
over-long code unit; it could have
+                                * fit in 3 bytes only. If c == 0xF4 and c2 >= 
0x90, then this codepoint is
+                                * greater than U+10FFFF, which is the highest 
legal codepoint */
+                               if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 
0x90) || (c == 0xF4 && c2 >= 0x90)) {
+                                       return false;
+                               } else if ((c3 & 0xC0) != 0x80) {
+                                       return false;
+                               } else if ((c4 & 0xC0) != 0x80) {
+                                       return false;
+                               }
+                       } else {
+                               return false;
+                       }
+               } else {
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size)
+{
+       lua_getglobal(L, "utf8");
+       lua_getfield(L, -1, "clean");
+
+       const char *orig_data = (const char*)Data;
+
+       uint8_t *Comma = memchr(Data, ',', Size);
+       const char *repl = NULL;
+       size_t repl_len;
+
+       if (Comma) {
+               /* We will pass two arguments (the 2nd one is optional) */
+               lua_pushlstring(L, (const char*)Data, Comma - Data);
+               Size -= Comma - Data + 1;
+               Data = Comma + 1;
+               repl = (const char*)Data;
+               repl_len = Size;
+       }
+
+       lua_pushlstring(L, (const char*)Data, Size);
+
+       size_t input_len = lua_objlen(L, Comma ? -2 : -1);
+
+       /*
+       const char *dbg = lua_tostring(L, Comma ? -2 : -1);
+       printf("Input length = %zu\n", input_len);
+       printf("Input = ");
+       for (int i = 0; i < input_len; i++)
+               printf("%02x", dbg[i] & 0xFF);
+       printf("\n");
+       */
+
+       int err = lua_pcall(L, Comma ? 2 : 1, 2, 0);
+       /* printf("Err = %x\n", err); */
+
+       if (err) {
+               /* utf8.clean raised an error */
+               assert(repl != NULL);
+
+               /*
+               if (err == 2) {
+                       const char *errmsg = lua_tostring(L, -1);
+                       printf("Err message = %s\n", errmsg);
+               }
+
+               printf("Replacement length = %zu\n", repl_len);
+               printf("Replacement = ");
+               for (int i = 0; i < repl_len; i++)
+                       printf("%02x", repl[i] & 0xFF);
+               printf("\n");
+               */
+
+               assert(!php_mbstring_check_utf8((unsigned char*)repl, 
repl_len));
+       } else {
+               assert(lua_isstring(L, -2));
+               assert(lua_isboolean(L, -1));
+               const char *str = lua_tostring(L, -2);
+               int was_clean = lua_toboolean(L, -1);
+               size_t output_len = lua_objlen(L, -2);
+
+               /*
+               printf("Output length = %zu\n", output_len);
+               printf("Output = ");
+               for (int i = 0; i < output_len; i++)
+                       printf("%02x", str[i] & 0xFF);
+               printf("\n");
+               */
+
+               if (was_clean) {
+                       assert(input_len == output_len);
+                       assert(memcmp(orig_data, str, input_len) == 0);
+               } else {
+                       assert(input_len != output_len || memcmp(orig_data, 
str, input_len) != 0);
+               }
+               assert(php_mbstring_check_utf8((unsigned char*)str, 
output_len));
+       }
+
+       lua_settop(L, 0); // clear Lua stack
+
+       return 0;
+}
+
+int LLVMFuzzerInitialize(int *argc, char ***argv)
+{
+       L = luaL_newstate();
+       luaL_openlibs(L);
+       lua_getglobal(L, "require");
+       lua_pushstring(L, "lua-utf8");
+       lua_call(L, 1, 1);
+       lua_setglobal(L, "utf8");
+       return 0;
+}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/luautf8-0.1.4/fuzzer/fuzz-invalid.c 
new/luautf8-0.1.5/fuzzer/fuzz-invalid.c
--- old/luautf8-0.1.4/fuzzer/fuzz-invalid.c     1970-01-01 01:00:00.000000000 
+0100
+++ new/luautf8-0.1.5/fuzzer/fuzz-invalid.c     2022-12-01 15:56:51.000000000 
+0100
@@ -0,0 +1,138 @@
+#include <stdint.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <math.h>
+
+#include "lua.h"
+#include "lualib.h"
+#include "lauxlib.h"
+
+lua_State *L;
+
+/* Adapted from mb_utf8_to_wchar (from the PHP codebase) */
+static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len)
+{
+       unsigned char *p = in, *e = p + in_len;
+
+       while (p < e) {
+               unsigned char c = *p++;
+
+               if (c < 0x80) {
+                       /* do nothing */
+               } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */
+                       if (p < e) {
+                               unsigned char c2 = *p++;
+                               if ((c2 & 0xC0) != 0x80) {
+                                       return false;
+                               }
+                       } else {
+                               return false;
+                       }
+               } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */
+                       if ((e - p) >= 2) {
+                               unsigned char c2 = *p++;
+                               unsigned char c3 = *p++;
+                               if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 
0xA0) || (c == 0xED && c2 >= 0xA0)) {
+                                       return false;
+                               } else if ((c3 & 0xC0) != 0x80) {
+                                       return false;
+                               }
+                       } else {
+                               return false;
+                       }
+               } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */
+                       if ((e - p) >= 3) {
+                               unsigned char c2 = *p++;
+                               unsigned char c3 = *p++;
+                               unsigned char c4 = *p++;
+                               /* If c == 0xF0 and c2 < 0x90, then this is an 
over-long code unit; it could have
+                                * fit in 3 bytes only. If c == 0xF4 and c2 >= 
0x90, then this codepoint is
+                                * greater than U+10FFFF, which is the highest 
legal codepoint */
+                               if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 
0x90) || (c == 0xF4 && c2 >= 0x90)) {
+                                       return false;
+                               } else if ((c3 & 0xC0) != 0x80) {
+                                       return false;
+                               } else if ((c4 & 0xC0) != 0x80) {
+                                       return false;
+                               }
+                       } else {
+                               return false;
+                       }
+               } else {
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size)
+{
+       lua_getglobal(L, "utf8");
+       lua_getfield(L, -1, "invalidoffset");
+
+       int offset = 0;
+       if (Size > 2) {
+               offset = *Data++;
+               if (*Data++ % 2 == 1)
+                       offset = -offset;
+               Size -= 2;
+       }
+
+       lua_pushlstring(L, (const char*)Data, Size);
+       lua_pushinteger(L, offset);
+
+       /*
+       const char *dbg = lua_tostring(L, -2);
+       printf("Input length = %zu\n", Size);
+       printf("Input = ");
+       for (int i = 0; i < Size; i++)
+               printf("%02x", Data[i] & 0xFF);
+       printf("\n");
+       printf("Offset = %d\n", offset);
+       */
+
+       lua_call(L, 2, 1);
+
+       assert(lua_isnumber(L, -1) || lua_isnil(L, -1));
+
+       /* Convert offset into a positive number from 1 - length of string
+        * (offset is 1-based, not 0-based) */
+       if (offset < 0) {
+               offset = Size + offset + 1;
+               if (offset <= 0) {
+                       offset = 1;
+               }
+       } else if (offset == 0) {
+               offset = 1;
+       } else if (offset > Size) {
+               offset = Size + 1;
+       }
+
+       if (lua_isnumber(L, -1)) {
+               double retval = lua_tonumber(L, -1);
+               /* printf("Retval = %d\n", (int)retval); */
+               assert(floor(retval) == ceil(retval)); /* Although 'double', 
it's actually an integer */
+               assert(retval >= offset);
+               assert(retval > 0);
+               assert(retval <= Size);
+               assert(!php_mbstring_check_utf8((unsigned char*)Data + 
(int)retval - 1, Size - (int)retval + 1));
+       } else {
+               assert(php_mbstring_check_utf8((unsigned char*)Data + offset - 
1, Size - offset + 1));
+       }
+
+       lua_settop(L, 0); // clear Lua stack
+
+       return 0;
+}
+
+int LLVMFuzzerInitialize(int *argc, char ***argv)
+{
+       L = luaL_newstate();
+       luaL_openlibs(L);
+       lua_getglobal(L, "require");
+       lua_pushstring(L, "lua-utf8");
+       lua_call(L, 1, 1);
+       lua_setglobal(L, "utf8");
+       return 0;
+}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/luautf8-0.1.4/fuzzer/fuzz-valid.c 
new/luautf8-0.1.5/fuzzer/fuzz-valid.c
--- old/luautf8-0.1.4/fuzzer/fuzz-valid.c       1970-01-01 01:00:00.000000000 
+0100
+++ new/luautf8-0.1.5/fuzzer/fuzz-valid.c       2022-12-01 15:56:51.000000000 
+0100
@@ -0,0 +1,97 @@
+#include <stdint.h>
+#include <stdbool.h>
+#include <assert.h>
+
+#include "lua.h"
+#include "lualib.h"
+#include "lauxlib.h"
+
+lua_State *L;
+
+/* Adapted from mb_utf8_to_wchar (from the PHP codebase) */
+static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len)
+{
+       unsigned char *p = in, *e = p + in_len;
+
+       while (p < e) {
+               unsigned char c = *p++;
+
+               if (c < 0x80) {
+                       /* do nothing */
+               } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */
+                       if (p < e) {
+                               unsigned char c2 = *p++;
+                               if ((c2 & 0xC0) != 0x80) {
+                                       return false;
+                               }
+                       } else {
+                               return false;
+                       }
+               } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */
+                       if ((e - p) >= 2) {
+                               unsigned char c2 = *p++;
+                               unsigned char c3 = *p++;
+                               if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 
0xA0) || (c == 0xED && c2 >= 0xA0)) {
+                                       return false;
+                               } else if ((c3 & 0xC0) != 0x80) {
+                                       return false;
+                               }
+                       } else {
+                               return false;
+                       }
+               } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */
+                       if ((e - p) >= 3) {
+                               unsigned char c2 = *p++;
+                               unsigned char c3 = *p++;
+                               unsigned char c4 = *p++;
+                               /* If c == 0xF0 and c2 < 0x90, then this is an 
over-long code unit; it could have
+                                * fit in 3 bytes only. If c == 0xF4 and c2 >= 
0x90, then this codepoint is
+                                * greater than U+10FFFF, which is the highest 
legal codepoint */
+                               if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 
0x90) || (c == 0xF4 && c2 >= 0x90)) {
+                                       return false;
+                               } else if ((c3 & 0xC0) != 0x80) {
+                                       return false;
+                               } else if ((c4 & 0xC0) != 0x80) {
+                                       return false;
+                               }
+                       } else {
+                               return false;
+                       }
+               } else {
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size)
+{
+       lua_getglobal(L, "utf8");
+       lua_getfield(L, -1, "isvalid");
+       lua_pushlstring(L, (const char*)Data, Size);
+       lua_call(L, 1, 1);
+
+       assert(lua_isboolean(L, -1));
+       int was_valid = lua_toboolean(L, -1);
+       if (was_valid) {
+               assert(php_mbstring_check_utf8((unsigned char*)Data, Size));
+       } else {
+               assert(!php_mbstring_check_utf8((unsigned char*)Data, Size));
+       }
+
+       lua_settop(L, 0); // clear Lua stack
+
+       return 0;
+}
+
+int LLVMFuzzerInitialize(int *argc, char ***argv)
+{
+       L = luaL_newstate();
+       luaL_openlibs(L);
+       lua_getglobal(L, "require");
+       lua_pushstring(L, "lua-utf8");
+       lua_call(L, 1, 1);
+       lua_setglobal(L, "utf8");
+       return 0;
+}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/luautf8-0.1.4/lutf8lib.c new/luautf8-0.1.5/lutf8lib.c
--- old/luautf8-0.1.4/lutf8lib.c        2022-10-01 16:29:35.000000000 +0200
+++ new/luautf8-0.1.5/lutf8lib.c        2022-12-01 15:56:51.000000000 +0100
@@ -4,9 +4,9 @@
 #include <lauxlib.h>
 #include <lualib.h>
 
-
 #include <assert.h>
 #include <string.h>
+#include <stdint.h>
 
 #include "unidata.h"
 
@@ -115,6 +115,54 @@
   return *i < *j;
 }
 
+/* Indexed by top nibble of first byte in code unit */
+static uint8_t utf8_code_unit_len[] = {
+  1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 2, 2, 3, 4
+};
+
+/* Return pointer to first invalid UTF-8 sequence in 's', or NULL if valid */
+static const char *utf8_invalid_offset(const char *s, const char *e) {
+  while (s < e) {
+    uint8_t c = *s;
+    if (c >= 0x80) {
+      /* c < 0xC0 means a continuation byte, but we are not in the middle of a 
multi-byte code unit
+       * c >= 0xC0 && c < 0xC2 means an overlong 2-byte code unit
+       * c >= 0xF8 means a 5-byte or 6-byte code unit, which is illegal, or 
else illegal byte 0xFE/0xFF
+       * c >= 0xF5 && c < 0xF8 means a 4-byte code unit encoding invalid 
codepoint > U+10FFFF */
+      if (c < 0xC2 || c >= 0xF5)
+        return s;
+      uint8_t needed_bytes = utf8_code_unit_len[c >> 4];
+      if (e - s < needed_bytes)
+        return s; /* String is truncated */
+      uint8_t c2 = *(s+1);
+      if ((c2 & 0xC0) != 0x80)
+        return s; /* 2nd byte of code unit is not a continuation byte */
+      if (needed_bytes >= 3) {
+        uint8_t c3 = *(s+2);
+        if ((c3 & 0xC0) != 0x80)
+          return s; /* 3rd byte of code unit is not a continuation byte */
+        if (needed_bytes == 3) {
+          if (c == 0xE0 && c2 < 0xA0)
+            return s; /* Overlong 3-byte code unit */
+          if (c == 0xED && c2 >= 0xA0)
+            return s; /* Reserved codepoint from U+D800-U+DFFF */
+        } else {
+          uint8_t c4 = *(s+3);
+          if ((c4 & 0xC0) != 0x80)
+            return s; /* 4th byte of code unit is not a continuation byte */
+          if (c == 0xF0 && c2 < 0x90)
+            return s; /* Overlong 4-byte code unit */
+          if (c == 0xF4 && c2 >= 0x90)
+            return s; /* Illegal codepoint > U+10FFFF */
+        }
+      }
+      s += needed_bytes;
+    } else {
+      s++;
+    }
+  }
+  return NULL;
+}
 
 /* Unicode character categories */
 
@@ -1239,6 +1287,83 @@
   return 2;
 }
 
+static int Lutf8_isvalid(lua_State *L) {
+  const char *e, *s = check_utf8(L, 1, &e);
+  const char *invalid = utf8_invalid_offset(s, e);
+  lua_pushboolean(L, invalid == NULL);
+  return 1;
+}
+
+static int Lutf8_invalidoffset(lua_State *L) {
+  const char *e, *s = check_utf8(L, 1, &e);
+  const char *orig_s = s;
+  int offset = luaL_optinteger(L, 2, 0);
+  if (offset > 1) {
+    offset--;
+    s += offset;
+    if (s >= e) {
+      lua_pushnil(L);
+      return 1;
+    }
+  } else if (offset < 0 && s - e < offset) {
+    s = e + offset;
+  }
+  const char *invalid = utf8_invalid_offset(s, e);
+  if (invalid == NULL) {
+    lua_pushnil(L);
+  } else {
+    lua_pushinteger(L, invalid - orig_s + 1);
+  }
+  return 1;
+}
+
+static int Lutf8_clean(lua_State *L) {
+  const char *e, *s = check_utf8(L, 1, &e);
+
+  /* Default replacement string is REPLACEMENT CHARACTER U+FFFD */
+  size_t repl_len;
+  const char *r = luaL_optlstring(L, 2, "\xEF\xBF\xBD", &repl_len);
+
+  if (lua_gettop(L) > 1) {
+    /* Check if replacement string is valid UTF-8 or not */
+    if (utf8_invalid_offset(r, r + repl_len) != NULL) {
+      lua_pushstring(L, "replacement string must be valid UTF-8");
+      lua_error(L);
+    }
+  }
+
+  const char *invalid = utf8_invalid_offset(s, e);
+  if (invalid == NULL) {
+    lua_settop(L, 1); /* Return input string without modification */
+    lua_pushboolean(L, 1); /* String was clean already */
+    return 2;
+  }
+
+  luaL_Buffer buff;
+  luaL_buffinit(L, &buff);
+
+  while (1) {
+    /* Invariant: 's' points to first GOOD byte not in output buffer,
+     * 'invalid' points to first BAD byte after that */
+    luaL_addlstring(&buff, s, invalid - s);
+    luaL_addlstring(&buff, r, repl_len);
+    /* We do not replace every bad byte with the replacement character,
+     * but rather a contiguous sequence of bad bytes
+     * Restore the invariant by stepping forward until we find at least
+     * one good byte */
+    s = invalid;
+    while (s == invalid) {
+      s++;
+      invalid = utf8_invalid_offset(s, e);
+    }
+    if (invalid == NULL) {
+      luaL_addlstring(&buff, s, e - s);
+      luaL_pushresult(&buff);
+      lua_pushboolean(L, 0); /* String was not clean */
+      return 2;
+    }
+  }
+}
 
 /* lua module import interface */
 
@@ -1276,6 +1401,9 @@
     ENTRY(gmatch),
     ENTRY(gsub),
     ENTRY(match),
+    ENTRY(isvalid),
+    ENTRY(invalidoffset),
+    ENTRY(clean),
 #undef  ENTRY
     { NULL, NULL }
   };
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/luautf8-0.1.4/rockspecs/luautf8-0.1.4-1.rockspec 
new/luautf8-0.1.5/rockspecs/luautf8-0.1.4-1.rockspec
--- old/luautf8-0.1.4/rockspecs/luautf8-0.1.4-1.rockspec        2022-10-01 
16:29:35.000000000 +0200
+++ new/luautf8-0.1.5/rockspecs/luautf8-0.1.4-1.rockspec        2022-12-01 
15:56:51.000000000 +0100
@@ -1,8 +1,8 @@
 package = "luautf8"
 version = "0.1.4-1"
 source = {
-   url = "https://github.com/starwing/luautf8/archive/0.1.3.tar.gz",
-   dir = "luautf8-0.1.3"
+   url = "https://github.com/starwing/luautf8/archive/refs/tags/0.1.4.tar.gz",
+   dir = "luautf8-0.1.4"
 }
 description = {
    summary = "A UTF-8 support module for Lua",
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/luautf8-0.1.4/rockspecs/luautf8-0.1.5-1.rockspec 
new/luautf8-0.1.5/rockspecs/luautf8-0.1.5-1.rockspec
--- old/luautf8-0.1.4/rockspecs/luautf8-0.1.5-1.rockspec        1970-01-01 
01:00:00.000000000 +0100
+++ new/luautf8-0.1.5/rockspecs/luautf8-0.1.5-1.rockspec        2022-12-01 
15:56:51.000000000 +0100
@@ -0,0 +1,23 @@
+package = "luautf8"
+version = "0.1.5-1"
+source = {
+   url = "https://github.com/starwing/luautf8/archive/refs/tags/0.1.5.tar.gz",
+   dir = "luautf8-0.1.5"
+}
+description = {
+   summary = "A UTF-8 support module for Lua",
+   detailed = [[
+   This module adds UTF-8 support to Lua. It's compatible with Lua "string" module.
+]],
+   homepage = "http://github.com/starwing/luautf8",
+   license = "MIT"
+}
+dependencies = {
+   "lua >= 5.1"
+}
+build = {
+   type = "builtin",
+   modules = {
+      ["lua-utf8"] = "lutf8lib.c"
+   }
+}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/luautf8-0.1.4/test.lua new/luautf8-0.1.5/test.lua
--- old/luautf8-0.1.4/test.lua  2022-10-01 16:29:35.000000000 +0200
+++ new/luautf8-0.1.5/test.lua  2022-12-01 15:56:51.000000000 +0100
@@ -221,6 +221,139 @@
 end
 
 
+-- test isvalid
+local good_strings = {
+   '',
+   'A',
+   'abcdefghijklmnopqrstuvwxyz',
+   "``",
+   "@",
+   'नमस्ते',
+   '中国',
+   '日本語01234567890。',
+   'ひらがな',
+   'Καλημέρα',
+   'АБВГ',
+   '⡌⠁⠧⠑ ⠼',
+   '∑ f(i)',
+   'Οὐχὶ ταὐτὰ παρίσταταί μοι 
γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι, ὅταν τ᾿ 
εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς 
τοὺς',
+   'ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789 abcdefghijklmnopqrstuvwxyz 
£©µÀÆÖÞßéöÿАБВГДабвгд∀∂∈ℝ∧∪≡∞ 
↑↗↨↻⇣',
+   'გთხოვთ ახლავე გაიაროთ რ
ეგისტრაცია Unicode-ის მეათე საერ
თაშორისო კონფერენციაზე 
დასასწრებად, რომელიც გაიმარ
თება 10-12 მარტს',
+   '\000' -- NUL is valid in UTF-8
+}
+
+for _, good in ipairs(good_strings) do
+   assert(utf8.isvalid(good))
+end
+
+assert(not utf8.isvalid("\255")) -- illegal byte 0xFF
+assert(not utf8.isvalid("abc\254def")) -- illegal byte 0xFE
+
+assert(not utf8.isvalid("123 \223")) -- truncated code unit 0xDF
+assert(not utf8.isvalid("123 \239\191")) -- truncated code unit 0xEF BF
+assert(not utf8.isvalid("123 \240\191")) -- truncated code unit 0xF0 BF
+assert(not utf8.isvalid("123 \240\191\191")) -- truncated code unit 0xF0 BF BF
+
+assert(not utf8.isvalid('\223ABC')) -- code unit 0xDF ended too soon and went 
to ASCII
+assert(not utf8.isvalid('\239\191ABC')) -- code unit 0xEF BF ended too soon 
and went to ASCII
+assert(not utf8.isvalid('\240\191ABC')) -- code unit 0xF0 BF ended too soon 
and went to ASCII
+assert(not utf8.isvalid('\240\191\191ABC')) -- code unit 0xF0 BF BF ended too 
soon and went to ASCII
+
+assert(not utf8.isvalid('\223中')) -- code unit 0xDF ended too soon and went 
to another multi-byte char
+assert(not utf8.isvalid('\239\191中')) -- code unit 0xEF BF ended too soon 
and went to another multi-byte char
+assert(not utf8.isvalid('\240\191中')) -- code unit 0xF0 BF ended too soon 
and went to another multi-byte char
+assert(not utf8.isvalid('\240\191\191中')) -- code unit 0xF0 BF BF ended too 
soon and went to another multi-byte char
+
+assert(utf8.isvalid('\237\159\191')) -- U+D7FF is valid
+assert(not utf8.isvalid('\237\160\128')) -- U+D800; reserved for UTF-16 
surrogate
+assert(not utf8.isvalid('\237\175\191')) -- U+DBFF; reserved for UTF-16 
surrogate
+assert(not utf8.isvalid('\237\191\191')) -- U+DFFF; reserved for UTF-16 
surrogate
+assert(utf8.isvalid('\238\128\128')) -- U+E000 is valid
+
+assert(utf8.isvalid('\244\143\191\191')) -- U+10FFFF is valid
+assert(not utf8.isvalid('\244\144\128\128')) -- U+110000 is not valid
+assert(not utf8.isvalid('\247\191\191\191')) -- U+1FFFFF is not valid
+
+assert(not utf8.isvalid('\128')) -- continuation byte outside a multi-byte char
+assert(not utf8.isvalid('A\128A')) -- continuation byte outside a multi-byte 
char
+assert(not utf8.isvalid('中\128')) -- continuation byte outside a multi-byte 
char
+
+assert(not utf8.isvalid('\193\191')) -- overlong code unit
+assert(not utf8.isvalid('\224\159\191')) -- overlong code unit
+assert(not utf8.isvalid('\240\143\191\191')) -- overlong code unit
+
+-- test clean
+local cleaned, was_clean
+
+for _, good in ipairs(good_strings) do
+   cleaned, was_clean = utf8.clean(good)
+   assert(cleaned == good)
+   assert(was_clean)
+end
+
+cleaned, was_clean = utf8.clean('A\128A')
+assert(cleaned == 'A�A')
+assert(not was_clean)
+
+cleaned, was_clean = utf8.clean('\128')
+assert(cleaned == '�')
+assert(not was_clean)
+
+cleaned, was_clean = utf8.clean('1\193\1912\224\159\1913\240\143\191\191', 
'???')
+assert(cleaned == '1???2???3???')
+assert(not was_clean)
+
+cleaned, was_clean = utf8.clean('\237\160\128\237\175\191\237\191\191')
+assert(cleaned == '�') -- an entire sequence of bad bytes just gets replaced 
with one replacement char
+assert(not was_clean)
+
+cleaned, was_clean = utf8.clean('123 \223', '')
+assert(cleaned == '123 ')
+assert(not was_clean)
+
+cleaned, was_clean = utf8.clean('\239\191中', '')
+assert(cleaned == '中')
+assert(not was_clean)
+
+assert_error(function() utf8.clean('abc', '\255') end, "replacement string 
must be valid UTF%-8")
+
+
+-- test invalidoffset
+for _, good in ipairs(good_strings) do
+   assert(utf8.invalidoffset(good) == nil)
+end
+
+assert(utf8.invalidoffset("\255") == 1)
+assert(utf8.invalidoffset("\255", 0) == 1)
+assert(utf8.invalidoffset("\255", 1) == 1)
+assert(utf8.invalidoffset("\255", 2) == nil)
+assert(utf8.invalidoffset("\255", -1) == 1)
+assert(utf8.invalidoffset("\255", -2) == 1)
+assert(utf8.invalidoffset("\255", -3) == 1)
+
+assert(utf8.invalidoffset("abc\254def") == 4)
+assert(utf8.invalidoffset("abc\254def", 0) == 4)
+assert(utf8.invalidoffset("abc\254def", 1) == 4)
+assert(utf8.invalidoffset("abc\254def", 2) == 4)
+assert(utf8.invalidoffset("abc\254def", 3) == 4)
+assert(utf8.invalidoffset("abc\254def", 4) == 4)
+assert(utf8.invalidoffset("abc\254def", 5) == nil)
+assert(utf8.invalidoffset("abc\254def", 6) == nil)
+assert(utf8.invalidoffset("abc\254def", -1) == nil)
+assert(utf8.invalidoffset("abc\254def", -2) == nil)
+assert(utf8.invalidoffset("abc\254def", -3) == nil)
+assert(utf8.invalidoffset("abc\254def", -4) == 4)
+assert(utf8.invalidoffset("abc\254def", -5) == 4)
+
+assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 0) == 1)
+assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 1) == 1)
+assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 2) == 2)
+assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 3) == 3)
+assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 4) == 4)
+assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 5) == 5)
+assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 6) == 6)
+assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', -1) == 9)
+
 print "OK"
 
 -- cc: run='lua -- $input'

Reply via email to