Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package lua-luautf8 for openSUSE:Factory checked in at 2022-12-03 10:03:49 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/lua-luautf8 (Old) and /work/SRC/openSUSE:Factory/.lua-luautf8.new.1835 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "lua-luautf8" Sat Dec 3 10:03:49 2022 rev:3 rq:1039693 version:0.1.5 Changes: -------- --- /work/SRC/openSUSE:Factory/lua-luautf8/lua-luautf8.changes 2022-11-20 19:47:05.221263146 +0100 +++ /work/SRC/openSUSE:Factory/.lua-luautf8.new.1835/lua-luautf8.changes 2022-12-03 10:04:05.903394285 +0100 @@ -1,0 +2,6 @@ +Sat Dec 3 00:26:27 UTC 2022 - Gordon Leung <[email protected]> + +- Update to version 0.1.5: + * add clean, isvalid, invalidposition functions + +------------------------------------------------------------------- Old: ---- luautf8-0.1.4.tar.xz New: ---- luautf8-0.1.5.tar.xz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ lua-luautf8.spec ++++++ --- /var/tmp/diff_new_pack.ALqXWn/_old 2022-12-03 10:04:06.343396730 +0100 +++ /var/tmp/diff_new_pack.ALqXWn/_new 2022-12-03 10:04:06.347396752 +0100 @@ -18,7 +18,7 @@ %define flavor @BUILD_FLAVOR@ %define mod_name luautf8 -%define rock_version 0.1.4-1 +%define rock_version 0.1.5-1 %ifarch %{ix86} %define luarock_arch x86 %else @@ -28,7 +28,7 @@ %define luarock_arch %{_arch} %endif %endif -Version: 0.1.4 +Version: 0.1.5 Release: 0 Summary: A utf-8 support module for Lua and LuaJIT License: MIT ++++++ _service ++++++ --- /var/tmp/diff_new_pack.ALqXWn/_old 2022-12-03 10:04:06.399397041 +0100 +++ /var/tmp/diff_new_pack.ALqXWn/_new 2022-12-03 10:04:06.403397063 +0100 @@ -3,7 +3,7 @@ <param name="url">https://github.com/starwing/luautf8</param> <param name="versionformat">@PARENT_TAG@</param> <param name="scm">git</param> - <param name="revision">a3db9cca0d7d82d78e2acaba2b5571178fcddc01</param> + <param name="revision">751c782864f4c636760339e16f218d6dee292d5d</param> <param name="versionrewrite-pattern">(\d+.\d+.\d+)</param> <param name="versionrewrite-replacement">\1</param> </service> ++++++ luautf8-0.1.4.tar.xz -> luautf8-0.1.5.tar.xz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/luautf8-0.1.4/README.md new/luautf8-0.1.5/README.md --- old/luautf8-0.1.4/README.md 2022-10-01 16:29:35.000000000 +0200 +++ new/luautf8-0.1.5/README.md 2022-12-01 15:56:51.000000000 +0100 @@ -15,11 +15,12 @@ It mainly used to compatible with Lua's own string module, it passed all string and pattern matching test in lua test suite[2]. -It also add some useful routines against UTF-8 features, some like: +It also adds some useful routines against UTF-8 features, such as: - a convenient interface to escape Unicode sequence in string. - string insert/remove, since UTF-8 substring extract may expensive. - calculate Unicode width, useful when implement e.g. console emulator. - a useful interface to translate Unicode offset and byte offset. +- checking UTF-8 strings for validity and removing invalid byte sequences. Note that to avoid conflict with the Lua5.3's buitin library 'utf8', this library produce a file like lua-utf8.dll or lua-utf8.so. so use @@ -162,6 +163,24 @@ compare a and b without case, -1 means a < b, 0 means a == b and 1 means a > b. +### utf8.isvalid(s) -> boolean +check whether s is a valid UTF-8 string or not. + + +### utf8.clean(s[, replacement_string]) -> cleaned_string, was_valid +replace any invalid UTF-8 byte sequences in s with the replacement string. +if no replacement string is provided, the default is "�" (REPLACEMENT CHARACTER U+FFFD). +note that *any* number of consecutive invalid bytes will be replaced by a single copy of the replacement string. +the 2nd return value is true if the original string was already valid (meaning no replacements were made). + + +### utf8.invalidoffset(s[, init]) -> offset +return the byte offset within s of the first invalid UTF-8 byte sequence. +(1 is the first byte of the string.) +if s is a valid UTF-8 string, return nil. +the optional numeric argument init specifies where to start the search; its default value is 1 and can be negative. + + Improvement needed ------------------ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/luautf8-0.1.4/fuzzer/Makefile new/luautf8-0.1.5/fuzzer/Makefile --- old/luautf8-0.1.4/fuzzer/Makefile 1970-01-01 01:00:00.000000000 +0100 +++ new/luautf8-0.1.5/fuzzer/Makefile 2022-12-01 15:56:51.000000000 +0100 @@ -0,0 +1,13 @@ +ALL: lua-utf8.so fuzz-valid fuzz-clean fuzz-invalid + +lua-utf8.so: ../lutf8lib.c + gcc -g -fPIC $$(pkg-config --cflags lua5.1) ../lutf8lib.c -shared -o lua-utf8.so + +fuzz-valid: fuzz-valid.c + clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 -llua5.1 fuzz-valid.c -o fuzz-valid + +fuzz-clean: fuzz-clean.c + clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 -llua5.1 fuzz-clean.c -o fuzz-clean + +fuzz-invalid: fuzz-invalid.c + clang -g -fsanitize=address,fuzzer,undefined -I/usr/include/lua5.1 -llua5.1 fuzz-invalid.c -o fuzz-invalid diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/luautf8-0.1.4/fuzzer/fuzz-clean.c new/luautf8-0.1.5/fuzzer/fuzz-clean.c --- old/luautf8-0.1.4/fuzzer/fuzz-clean.c 1970-01-01 01:00:00.000000000 +0100 +++ new/luautf8-0.1.5/fuzzer/fuzz-clean.c 2022-12-01 15:56:51.000000000 +0100 @@ -0,0 +1,161 @@ +#include <stdint.h> +#include <stdbool.h> +#include <string.h> +#include <assert.h> + +#include "lua.h" +#include "lualib.h" +#include "lauxlib.h" + +lua_State *L; + +/* Adapted from mb_utf8_to_wchar (from the PHP codebase) */ +static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + + while (p < e) { + unsigned char c = *p++; + + if (c < 0x80) { + /* do nothing */ + } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ + if (p < e) { + unsigned char c2 = *p++; + if ((c2 & 0xC0) != 0x80) { + return false; + } + } else { + return false; + } + } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ + if ((e - p) >= 2) { + unsigned char c2 = *p++; + unsigned char c3 = *p++; + if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { + return false; + } else if ((c3 & 0xC0) != 0x80) { + return false; + } + } else { + return false; + } + } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ + if ((e - p) >= 3) { + unsigned char c2 = *p++; + unsigned char c3 = *p++; + unsigned char c4 = *p++; + /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have + * fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is + * greater than U+10FFFF, which is the highest legal codepoint */ + if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { + return false; + } else if ((c3 & 0xC0) != 0x80) { + return false; + } else if ((c4 & 0xC0) != 0x80) { + return false; + } + } else { + return false; + } + } else { + return false; + } + } + + return true; +} + +int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) +{ + lua_getglobal(L, "utf8"); + lua_getfield(L, -1, "clean"); + + const char *orig_data = (const char*)Data; + + uint8_t *Comma = memchr(Data, ',', Size); + const char *repl = NULL; + size_t repl_len; + + if (Comma) { + /* We will pass two arguments (the 2nd one is optional) */ + lua_pushlstring(L, (const char*)Data, Comma - Data); + Size -= Comma - Data + 1; + Data = Comma + 1; + repl = (const char*)Data; + repl_len = Size; + } + + lua_pushlstring(L, (const char*)Data, Size); + + size_t input_len = lua_objlen(L, Comma ? -2 : -1); + + /* + const char *dbg = lua_tostring(L, Comma ? -2 : -1); + printf("Input length = %zu\n", input_len); + printf("Input = "); + for (int i = 0; i < input_len; i++) + printf("%02x", dbg[i] & 0xFF); + printf("\n"); + */ + + int err = lua_pcall(L, Comma ? 2 : 1, 2, 0); + /* printf("Err = %x\n", err); */ + + if (err) { + /* utf8.clean raised an error */ + assert(repl != NULL); + + /* + if (err == 2) { + const char *errmsg = lua_tostring(L, -1); + printf("Err message = %s\n", errmsg); + } + + printf("Replacement length = %zu\n", repl_len); + printf("Replacement = "); + for (int i = 0; i < repl_len; i++) + printf("%02x", repl[i] & 0xFF); + printf("\n"); + */ + + assert(!php_mbstring_check_utf8((unsigned char*)repl, repl_len)); + } else { + assert(lua_isstring(L, -2)); + assert(lua_isboolean(L, -1)); + const char *str = lua_tostring(L, -2); + int was_clean = lua_toboolean(L, -1); + size_t output_len = lua_objlen(L, -2); + + /* + printf("Output length = %zu\n", output_len); + printf("Output = "); + for (int i = 0; i < output_len; i++) + printf("%02x", str[i] & 0xFF); + printf("\n"); + */ + + if (was_clean) { + assert(input_len == output_len); + assert(memcmp(orig_data, str, input_len) == 0); + } else { + assert(input_len != output_len || memcmp(orig_data, str, input_len) != 0); + } + assert(php_mbstring_check_utf8((unsigned char*)str, output_len)); + } + + lua_settop(L, 0); // clear Lua stack + + return 0; +} + +int LLVMFuzzerInitialize(int *argc, char ***argv) +{ + L = luaL_newstate(); + luaL_openlibs(L); + lua_getglobal(L, "require"); + lua_pushstring(L, "lua-utf8"); + lua_call(L, 1, 1); + lua_setglobal(L, "utf8"); + return 0; +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/luautf8-0.1.4/fuzzer/fuzz-invalid.c new/luautf8-0.1.5/fuzzer/fuzz-invalid.c --- old/luautf8-0.1.4/fuzzer/fuzz-invalid.c 1970-01-01 01:00:00.000000000 +0100 +++ new/luautf8-0.1.5/fuzzer/fuzz-invalid.c 2022-12-01 15:56:51.000000000 +0100 @@ -0,0 +1,138 @@ +#include <stdint.h> +#include <stdbool.h> +#include <assert.h> +#include <math.h> + +#include "lua.h" +#include "lualib.h" +#include "lauxlib.h" + +lua_State *L; + +/* Adapted from mb_utf8_to_wchar (from the PHP codebase) */ +static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + + while (p < e) { + unsigned char c = *p++; + + if (c < 0x80) { + /* do nothing */ + } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ + if (p < e) { + unsigned char c2 = *p++; + if ((c2 & 0xC0) != 0x80) { + return false; + } + } else { + return false; + } + } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ + if ((e - p) >= 2) { + unsigned char c2 = *p++; + unsigned char c3 = *p++; + if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { + return false; + } else if ((c3 & 0xC0) != 0x80) { + return false; + } + } else { + return false; + } + } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ + if ((e - p) >= 3) { + unsigned char c2 = *p++; + unsigned char c3 = *p++; + unsigned char c4 = *p++; + /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have + * fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is + * greater than U+10FFFF, which is the highest legal codepoint */ + if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { + return false; + } else if ((c3 & 0xC0) != 0x80) { + return false; + } else if ((c4 & 0xC0) != 0x80) { + return false; + } + } else { + return false; + } + } else { + return false; + } + } + + return true; +} + +int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) +{ + lua_getglobal(L, "utf8"); + lua_getfield(L, -1, "invalidoffset"); + + int offset = 0; + if (Size > 2) { + offset = *Data++; + if (*Data++ % 2 == 1) + offset = -offset; + Size -= 2; + } + + lua_pushlstring(L, (const char*)Data, Size); + lua_pushinteger(L, offset); + + /* + const char *dbg = lua_tostring(L, -2); + printf("Input length = %zu\n", Size); + printf("Input = "); + for (int i = 0; i < Size; i++) + printf("%02x", Data[i] & 0xFF); + printf("\n"); + printf("Offset = %d\n", offset); + */ + + lua_call(L, 2, 1); + + assert(lua_isnumber(L, -1) || lua_isnil(L, -1)); + + /* Convert offset into a positive number from 1 - length of string + * (offset is 1-based, not 0-based) */ + if (offset < 0) { + offset = Size + offset + 1; + if (offset <= 0) { + offset = 1; + } + } else if (offset == 0) { + offset = 1; + } else if (offset > Size) { + offset = Size + 1; + } + + if (lua_isnumber(L, -1)) { + double retval = lua_tonumber(L, -1); + /* printf("Retval = %d\n", (int)retval); */ + assert(floor(retval) == ceil(retval)); /* Although 'double', it's actually an integer */ + assert(retval >= offset); + assert(retval > 0); + assert(retval <= Size); + assert(!php_mbstring_check_utf8((unsigned char*)Data + (int)retval - 1, Size - (int)retval + 1)); + } else { + assert(php_mbstring_check_utf8((unsigned char*)Data + offset - 1, Size - offset + 1)); + } + + lua_settop(L, 0); // clear Lua stack + + return 0; +} + +int LLVMFuzzerInitialize(int *argc, char ***argv) +{ + L = luaL_newstate(); + luaL_openlibs(L); + lua_getglobal(L, "require"); + lua_pushstring(L, "lua-utf8"); + lua_call(L, 1, 1); + lua_setglobal(L, "utf8"); + return 0; +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/luautf8-0.1.4/fuzzer/fuzz-valid.c new/luautf8-0.1.5/fuzzer/fuzz-valid.c --- old/luautf8-0.1.4/fuzzer/fuzz-valid.c 1970-01-01 01:00:00.000000000 +0100 +++ new/luautf8-0.1.5/fuzzer/fuzz-valid.c 2022-12-01 15:56:51.000000000 +0100 @@ -0,0 +1,97 @@ +#include <stdint.h> +#include <stdbool.h> +#include <assert.h> + +#include "lua.h" +#include "lualib.h" +#include "lauxlib.h" + +lua_State *L; + +/* Adapted from mb_utf8_to_wchar (from the PHP codebase) */ +static bool php_mbstring_check_utf8(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + + while (p < e) { + unsigned char c = *p++; + + if (c < 0x80) { + /* do nothing */ + } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ + if (p < e) { + unsigned char c2 = *p++; + if ((c2 & 0xC0) != 0x80) { + return false; + } + } else { + return false; + } + } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ + if ((e - p) >= 2) { + unsigned char c2 = *p++; + unsigned char c3 = *p++; + if ((c2 & 0xC0) != 0x80 || (c == 0xE0 && c2 < 0xA0) || (c == 0xED && c2 >= 0xA0)) { + return false; + } else if ((c3 & 0xC0) != 0x80) { + return false; + } + } else { + return false; + } + } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ + if ((e - p) >= 3) { + unsigned char c2 = *p++; + unsigned char c3 = *p++; + unsigned char c4 = *p++; + /* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have + * fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is + * greater than U+10FFFF, which is the highest legal codepoint */ + if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) { + return false; + } else if ((c3 & 0xC0) != 0x80) { + return false; + } else if ((c4 & 0xC0) != 0x80) { + return false; + } + } else { + return false; + } + } else { + return false; + } + } + + return true; +} + +int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) +{ + lua_getglobal(L, "utf8"); + lua_getfield(L, -1, "isvalid"); + lua_pushlstring(L, (const char*)Data, Size); + lua_call(L, 1, 1); + + assert(lua_isboolean(L, -1)); + int was_valid = lua_toboolean(L, -1); + if (was_valid) { + assert(php_mbstring_check_utf8((unsigned char*)Data, Size)); + } else { + assert(!php_mbstring_check_utf8((unsigned char*)Data, Size)); + } + + lua_settop(L, 0); // clear Lua stack + + return 0; +} + +int LLVMFuzzerInitialize(int *argc, char ***argv) +{ + L = luaL_newstate(); + luaL_openlibs(L); + lua_getglobal(L, "require"); + lua_pushstring(L, "lua-utf8"); + lua_call(L, 1, 1); + lua_setglobal(L, "utf8"); + return 0; +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/luautf8-0.1.4/lutf8lib.c new/luautf8-0.1.5/lutf8lib.c --- old/luautf8-0.1.4/lutf8lib.c 2022-10-01 16:29:35.000000000 +0200 +++ new/luautf8-0.1.5/lutf8lib.c 2022-12-01 15:56:51.000000000 +0100 @@ -4,9 +4,9 @@ #include <lauxlib.h> #include <lualib.h> - #include <assert.h> #include <string.h> +#include <stdint.h> #include "unidata.h" @@ -115,6 +115,54 @@ return *i < *j; } +/* Indexed by top nibble of first byte in code unit */ +static uint8_t utf8_code_unit_len[] = { + 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 2, 2, 3, 4 +}; + +/* Return pointer to first invalid UTF-8 sequence in 's', or NULL if valid */ +static const char *utf8_invalid_offset(const char *s, const char *e) { + while (s < e) { + uint8_t c = *s; + if (c >= 0x80) { + /* c < 0xC0 means a continuation byte, but we are not in the middle of a multi-byte code unit + * c >= 0xC0 && c < 0xC2 means an overlong 2-byte code unit + * c >= 0xF8 means a 5-byte or 6-byte code unit, which is illegal, or else illegal byte 0xFE/0xFF + * c >= 0xF5 && c < 0xF8 means a 4-byte code unit encoding invalid codepoint > U+10FFFF */ + if (c < 0xC2 || c >= 0xF5) + return s; + uint8_t needed_bytes = utf8_code_unit_len[c >> 4]; + if (e - s < needed_bytes) + return s; /* String is truncated */ + uint8_t c2 = *(s+1); + if ((c2 & 0xC0) != 0x80) + return s; /* 2nd byte of code unit is not a continuation byte */ + if (needed_bytes >= 3) { + uint8_t c3 = *(s+2); + if ((c3 & 0xC0) != 0x80) + return s; /* 3rd byte of code unit is not a continuation byte */ + if (needed_bytes == 3) { + if (c == 0xE0 && c2 < 0xA0) + return s; /* Overlong 3-byte code unit */ + if (c == 0xED && c2 >= 0xA0) + return s; /* Reserved codepoint from U+D800-U+DFFF */ + } else { + uint8_t c4 = *(s+3); + if ((c4 & 0xC0) != 0x80) + return s; /* 4th byte of code unit is not a continuation byte */ + if (c == 0xF0 && c2 < 0x90) + return s; /* Overlong 4-byte code unit */ + if (c == 0xF4 && c2 >= 0x90) + return s; /* Illegal codepoint > U+10FFFF */ + } + } + s += needed_bytes; + } else { + s++; + } + } + return NULL; +} /* Unicode character categories */ @@ -1239,6 +1287,83 @@ return 2; } +static int Lutf8_isvalid(lua_State *L) { + const char *e, *s = check_utf8(L, 1, &e); + const char *invalid = utf8_invalid_offset(s, e); + lua_pushboolean(L, invalid == NULL); + return 1; +} + +static int Lutf8_invalidoffset(lua_State *L) { + const char *e, *s = check_utf8(L, 1, &e); + const char *orig_s = s; + int offset = luaL_optinteger(L, 2, 0); + if (offset > 1) { + offset--; + s += offset; + if (s >= e) { + lua_pushnil(L); + return 1; + } + } else if (offset < 0 && s - e < offset) { + s = e + offset; + } + const char *invalid = utf8_invalid_offset(s, e); + if (invalid == NULL) { + lua_pushnil(L); + } else { + lua_pushinteger(L, invalid - orig_s + 1); + } + return 1; +} + +static int Lutf8_clean(lua_State *L) { + const char *e, *s = check_utf8(L, 1, &e); + + /* Default replacement string is REPLACEMENT CHARACTER U+FFFD */ + size_t repl_len; + const char *r = luaL_optlstring(L, 2, "\xEF\xBF\xBD", &repl_len); + + if (lua_gettop(L) > 1) { + /* Check if replacement string is valid UTF-8 or not */ + if (utf8_invalid_offset(r, r + repl_len) != NULL) { + lua_pushstring(L, "replacement string must be valid UTF-8"); + lua_error(L); + } + } + + const char *invalid = utf8_invalid_offset(s, e); + if (invalid == NULL) { + lua_settop(L, 1); /* Return input string without modification */ + lua_pushboolean(L, 1); /* String was clean already */ + return 2; + } + + luaL_Buffer buff; + luaL_buffinit(L, &buff); + + while (1) { + /* Invariant: 's' points to first GOOD byte not in output buffer, + * 'invalid' points to first BAD byte after that */ + luaL_addlstring(&buff, s, invalid - s); + luaL_addlstring(&buff, r, repl_len); + /* We do not replace every bad byte with the replacement character, + * but rather a contiguous sequence of bad bytes + * Restore the invariant by stepping forward until we find at least + * one good byte */ + s = invalid; + while (s == invalid) { + s++; + invalid = utf8_invalid_offset(s, e); + } + if (invalid == NULL) { + luaL_addlstring(&buff, s, e - s); + luaL_pushresult(&buff); + lua_pushboolean(L, 0); /* String was not clean */ + return 2; + } + } +} /* lua module import interface */ @@ -1276,6 +1401,9 @@ ENTRY(gmatch), ENTRY(gsub), ENTRY(match), + ENTRY(isvalid), + ENTRY(invalidoffset), + ENTRY(clean), #undef ENTRY { NULL, NULL } }; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/luautf8-0.1.4/rockspecs/luautf8-0.1.4-1.rockspec new/luautf8-0.1.5/rockspecs/luautf8-0.1.4-1.rockspec --- old/luautf8-0.1.4/rockspecs/luautf8-0.1.4-1.rockspec 2022-10-01 16:29:35.000000000 +0200 +++ new/luautf8-0.1.5/rockspecs/luautf8-0.1.4-1.rockspec 2022-12-01 15:56:51.000000000 +0100 @@ -1,8 +1,8 @@ package = "luautf8" version = "0.1.4-1" source = { - url = "https://github.com/starwing/luautf8/archive/0.1.3.tar.gz", - dir = "luautf8-0.1.3" + url = "https://github.com/starwing/luautf8/archive/refs/tags/0.1.4.tar.gz", + dir = "luautf8-0.1.4" } description = { summary = "A UTF-8 support module for Lua", diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/luautf8-0.1.4/rockspecs/luautf8-0.1.5-1.rockspec new/luautf8-0.1.5/rockspecs/luautf8-0.1.5-1.rockspec --- old/luautf8-0.1.4/rockspecs/luautf8-0.1.5-1.rockspec 1970-01-01 01:00:00.000000000 +0100 +++ new/luautf8-0.1.5/rockspecs/luautf8-0.1.5-1.rockspec 2022-12-01 15:56:51.000000000 +0100 @@ -0,0 +1,23 @@ +package = "luautf8" +version = "0.1.5-1" +source = { + url = "https://github.com/starwing/luautf8/archive/refs/tags/0.1.4.tar.gz", + dir = "luautf8-0.1.4" +} +description = { + summary = "A UTF-8 support module for Lua", + detailed = [[ + This module adds UTF-8 support to Lua. It's compatible with Lua "string" module. +]], + homepage = "http://github.com/starwing/luautf8", + license = "MIT" +} +dependencies = { + "lua >= 5.1" +} +build = { + type = "builtin", + modules = { + ["lua-utf8"] = "lutf8lib.c" + } +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/luautf8-0.1.4/test.lua new/luautf8-0.1.5/test.lua --- old/luautf8-0.1.4/test.lua 2022-10-01 16:29:35.000000000 +0200 +++ new/luautf8-0.1.5/test.lua 2022-12-01 15:56:51.000000000 +0100 @@ -221,6 +221,139 @@ end +-- test isvalid +local good_strings = { + '', + 'A', + 'abcdefghijklmnopqrstuvwxyz', + "``", + "@", + 'नमसà¥à¤¤à¥', + 'ä¸å½', + 'æ¥æ¬èªï¼ï¼ï¼ï¼ï¼ï¼ï¼ï¼ï¼ï¼ï¼ã', + 'ã²ãããª', + 'ÎαλημÎÏα', + 'ÐÐÐÐ', + 'â¡â â §â â ¼', + 'â f(i)', + 'Îá½Ïá½¶ Ïαá½Ïá½° ÏαÏá½·ÏÏαÏαί μοι γιγνώÏκειν, ὦ á¼Î½Î´ÏÎµÏ á¾¿Îθηναá¿Î¿Î¹, á½ Ïαν Ï᾿ Îµá¼°Ï Ïá½° ÏÏάγμαÏα á¼ÏοβλέÏÏ ÎºÎ±á½¶ á½ Ïαν ÏÏá½¸Ï ÏοὺÏ', + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789 abcdefghijklmnopqrstuvwxyz £©µÃÃÃÃÃéöÿÐÐÐÐÐабвгдâââââ§âªâ¡â âââ¨â»â£', + 'ááá®ááá áá®áááá ááááá áá á áááá¡á¢á ááªáá Unicode-áá¡ ááááá á¡ááá ááá¨áá áá¡á áááá¤áá áááªáááá ááá¡áá¡á¬á áááá, á ááááá᪠áááááá áááá 10-12 ááá á¢á¡', + '\000' -- NUL is valid in UTF-8 +} + +for _, good in ipairs(good_strings) do + assert(utf8.isvalid(good)) +end + +assert(not utf8.isvalid("\255")) -- illegal byte 0xFF +assert(not utf8.isvalid("abc\254def")) -- illegal byte 0xFE + +assert(not utf8.isvalid("123 \223")) -- truncated code unit 0xDF +assert(not utf8.isvalid("123 \239\191")) -- truncated code unit 0xEF BF +assert(not utf8.isvalid("123 \240\191")) -- truncated code unit 0xF0 BF +assert(not utf8.isvalid("123 \240\191\191")) -- truncated code unit 0xF0 BF BF + +assert(not utf8.isvalid('\223ABC')) -- code unit 0xDF ended too soon and went to ASCII +assert(not utf8.isvalid('\239\191ABC')) -- code unit 0xEF BF ended too soon and went to ASCII +assert(not utf8.isvalid('\240\191ABC')) -- code unit 0xF0 BF ended too soon and went to ASCII +assert(not utf8.isvalid('\240\191\191ABC')) -- code unit 0xF0 BF BF ended too soon and went to ASCII + +assert(not utf8.isvalid('\223ä¸')) -- code unit 0xDF ended too soon and went to another multi-byte char +assert(not utf8.isvalid('\239\191ä¸')) -- code unit 0xEF BF ended too soon and went to another multi-byte char +assert(not utf8.isvalid('\240\191ä¸')) -- code unit 0xF0 BF ended too soon and went to another multi-byte char +assert(not utf8.isvalid('\240\191\191ä¸')) -- code unit 0xF0 BF BF ended too soon and went to another multi-byte char + +assert(utf8.isvalid('\237\159\191')) -- U+D7FF is valid +assert(not utf8.isvalid('\237\160\128')) -- U+D800; reserved for UTF-16 surrogate +assert(not utf8.isvalid('\237\175\191')) -- U+DBFF; reserved for UTF-16 surrogate +assert(not utf8.isvalid('\237\191\191')) -- U+DFFF; reserved for UTF-16 surrogate +assert(utf8.isvalid('\238\128\128')) -- U+E000 is valid + +assert(utf8.isvalid('\244\143\191\191')) -- U+10FFFF is valid +assert(not utf8.isvalid('\244\144\128\128')) -- U+110000 is not valid +assert(not utf8.isvalid('\247\191\191\191')) -- U+1FFFFF is not valid + +assert(not utf8.isvalid('\128')) -- continuation byte outside a multi-byte char +assert(not utf8.isvalid('A\128A')) -- continuation byte outside a multi-byte char +assert(not utf8.isvalid('ä¸\128')) -- continuation byte outside a multi-byte char + +assert(not utf8.isvalid('\193\191')) -- overlong code unit +assert(not utf8.isvalid('\224\159\191')) -- overlong code unit +assert(not utf8.isvalid('\240\143\191\191')) -- overlong code unit + +-- test clean +local cleaned, was_clean + +for _, good in ipairs(good_strings) do + cleaned, was_clean = utf8.clean(good) + assert(cleaned == good) + assert(was_clean) +end + +cleaned, was_clean = utf8.clean('A\128A') +assert(cleaned == 'A�A') +assert(not was_clean) + +cleaned, was_clean = utf8.clean('\128') +assert(cleaned == '�') +assert(not was_clean) + +cleaned, was_clean = utf8.clean('1\193\1912\224\159\1913\240\143\191\191', '???') +assert(cleaned == '1???2???3???') +assert(not was_clean) + +cleaned, was_clean = utf8.clean('\237\160\128\237\175\191\237\191\191') +assert(cleaned == '�') -- an entire sequence of bad bytes just gets replaced with one replacement char +assert(not was_clean) + +cleaned, was_clean = utf8.clean('123 \223', '') +assert(cleaned == '123 ') +assert(not was_clean) + +cleaned, was_clean = utf8.clean('\239\191ä¸', '') +assert(cleaned == 'ä¸') +assert(not was_clean) + +assert_error(function() utf8.clean('abc', '\255') end, "replacement string must be valid UTF%-8") + + +-- test invalidoffset +for _, good in ipairs(good_strings) do + assert(utf8.invalidoffset(good) == nil) +end + +assert(utf8.invalidoffset("\255") == 1) +assert(utf8.invalidoffset("\255", 0) == 1) +assert(utf8.invalidoffset("\255", 1) == 1) +assert(utf8.invalidoffset("\255", 2) == nil) +assert(utf8.invalidoffset("\255", -1) == 1) +assert(utf8.invalidoffset("\255", -2) == 1) +assert(utf8.invalidoffset("\255", -3) == 1) + +assert(utf8.invalidoffset("abc\254def") == 4) +assert(utf8.invalidoffset("abc\254def", 0) == 4) +assert(utf8.invalidoffset("abc\254def", 1) == 4) +assert(utf8.invalidoffset("abc\254def", 2) == 4) +assert(utf8.invalidoffset("abc\254def", 3) == 4) +assert(utf8.invalidoffset("abc\254def", 4) == 4) +assert(utf8.invalidoffset("abc\254def", 5) == nil) +assert(utf8.invalidoffset("abc\254def", 6) == nil) +assert(utf8.invalidoffset("abc\254def", -1) == nil) +assert(utf8.invalidoffset("abc\254def", -2) == nil) +assert(utf8.invalidoffset("abc\254def", -3) == nil) +assert(utf8.invalidoffset("abc\254def", -4) == 4) +assert(utf8.invalidoffset("abc\254def", -5) == 4) + +assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 0) == 1) +assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 1) == 1) +assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 2) == 2) +assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 3) == 3) +assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 4) == 4) +assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 5) == 5) +assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', 6) == 6) +assert(utf8.invalidoffset('\237\160\128\237\175\191\237\191\191', -1) == 9) + print "OK" -- cc: run='lua -- $input'
