https://www.mediawiki.org/wiki/Special:Code/MediaWiki/114650
Revision: 114650
Author: vvv
Date: 2012-04-02 08:53:16 +0000 (Mon, 02 Apr 2012)
Log Message:
-----------
Add the initial implementation of the Unicode support.
Modified Paths:
--------------
trunk/php/luasandbox/config.m4
trunk/php/luasandbox/data_conversion.c
trunk/php/luasandbox/library.c
Added Paths:
-----------
trunk/php/luasandbox/luasandbox_unicode.h
trunk/php/luasandbox/m4/ac_check_icu.m4
trunk/php/luasandbox/ustring.c
Modified: trunk/php/luasandbox/config.m4
===================================================================
--- trunk/php/luasandbox/config.m4 2012-04-02 02:06:53 UTC (rev 114649)
+++ trunk/php/luasandbox/config.m4 2012-04-02 08:53:16 UTC (rev 114650)
@@ -9,6 +9,10 @@
if test "$PHP_LUASANDBOX" != "no"; then
dnl Include pkg-config macros definitions:
m4_include([m4/pkg.m4])
+
+ dnl ICU did not support pkg-config until recently; the current WM version
+ dnl probably does not support it either
+ m4_include([m4/ac_check_icu.m4])
PKG_PROG_PKG_CONFIG
dnl We need lua or fallback to luajit.
@@ -19,12 +23,17 @@
])
])
+ AC_CHECK_ICU( [4.0] )
+
dnl LUA_LIBS and LUA_CFLAGS interprets them:
PHP_EVAL_INCLINE($LUA_CFLAGS)
PHP_EVAL_LIBLINE($LUA_LIBS, LUASANDBOX_SHARED_LIBADD)
-
+
+ PHP_EVAL_INCLINE($ICU_CFLAGS)
+ PHP_EVAL_LIBLINE($ICU_LIBS, LUASANDBOX_SHARED_LIBADD)
+
PHP_EVAL_LIBLINE("-lrt", LUASANDBOX_SHARED_LIBADD)
PHP_SUBST(LUASANDBOX_SHARED_LIBADD)
- PHP_NEW_EXTENSION(luasandbox, alloc.c data_conversion.c library.c
luasandbox.c timer.c, $ext_shared)
+ PHP_NEW_EXTENSION(luasandbox, alloc.c data_conversion.c library.c
luasandbox.c timer.c ustring.c, $ext_shared)
fi
Modified: trunk/php/luasandbox/data_conversion.c
===================================================================
--- trunk/php/luasandbox/data_conversion.c 2012-04-02 02:06:53 UTC (rev
114649)
+++ trunk/php/luasandbox/data_conversion.c 2012-04-02 08:53:16 UTC (rev
114650)
@@ -10,6 +10,7 @@
#include "php.h"
#include "php_luasandbox.h"
+#include "luasandbox_unicode.h"
static void luasandbox_lua_to_array(HashTable *ht, lua_State *L, int index,
zval * sandbox_zval, HashTable * recursionGuard TSRMLS_DC);
@@ -305,6 +306,13 @@
break;
}
case LUA_TUSERDATA:
+ if(luasandbox_isustr(L, index)) {
+ const uint8_t *str;
+ size_t length;
+ str = luasandbox_getustr(L, index, &length);
+ ZVAL_STRINGL(z, str, length, 1);
+ break;
+ }
case LUA_TTHREAD:
case LUA_TLIGHTUSERDATA:
default:
Modified: trunk/php/luasandbox/library.c
===================================================================
--- trunk/php/luasandbox/library.c 2012-04-02 02:06:53 UTC (rev 114649)
+++ trunk/php/luasandbox/library.c 2012-04-02 08:53:16 UTC (rev 114650)
@@ -14,6 +14,7 @@
#include "php.h"
#include "php_luasandbox.h"
+#include "luasandbox_unicode.h"
static HashTable * luasandbox_lib_get_allowed_globals(TSRMLS_D);
@@ -128,6 +129,9 @@
lua_pushcfunction(L, luasandbox_math_randomseed);
lua_setfield(L, -2, "randomseed");
lua_pop(L, 1);
+
+ // Install string-related functions
+ luasandbox_install_unicode_functions(L);
}
/* }}} */
Added: trunk/php/luasandbox/luasandbox_unicode.h
===================================================================
--- trunk/php/luasandbox/luasandbox_unicode.h (rev 0)
+++ trunk/php/luasandbox/luasandbox_unicode.h 2012-04-02 08:53:16 UTC (rev
114650)
@@ -0,0 +1,30 @@
+#ifndef LUASANDBOX_UNICODE_H
+#define LUASANDBOX_UNICODE_H
+
+#include <stdint.h>
+#include <lua.h>
+
+/**
+ * Unicode strings are input and stored as UTF-8.
+ *
+ * A ustring is a Lua userdata made of this header immediately followed by
+ * raw_len bytes of UTF-8 data.
+ */
+typedef struct {
+    size_t raw_len; // Byte length in UTF-8
+    int32_t cp_len; // Number of code points
+} luasandbox_ustr_header;
+
+// Pointer to the UTF-8 bytes stored right after the header
+#define LUASANDBOX_USTR_RAW(header) ( ((uint8_t*)(header)) + sizeof(luasandbox_ustr_header) )
+// Size in bytes of the header plus the UTF-8 data
+#define LUASANDBOX_USTR_TOTALLEN(header) ( sizeof(luasandbox_ustr_header) + (header)->raw_len )
+
+void luasandbox_install_unicode_functions(lua_State * L);
+
+luasandbox_ustr_header *luasandbox_init_ustr(lua_State * L, size_t len);
+luasandbox_ustr_header *luasandbox_push_ustr(lua_State * L, uint8_t *str, size_t len);
+int luasandbox_isustr(lua_State * L, int idx);
+luasandbox_ustr_header* luasandbox_checkustring(lua_State * L, int idx);
+const uint8_t* luasandbox_getustr(lua_State * L, int idx, size_t* raw_len);
+int32_t luasandbox_ustr_index_to_offset(lua_State * L, luasandbox_ustr_header *str, int32_t idx, int check_limits);
+
+void luasandbox_convert_toUTF16(lua_State * L, int idx);
+void luasandbox_convert_fromUTF16(lua_State * L, int idx);
+
+#endif
Property changes on: trunk/php/luasandbox/luasandbox_unicode.h
___________________________________________________________________
Added: svn:keywords
+ Author Date Id Rev URL
Added: svn:eol-style
+ native
Added: trunk/php/luasandbox/m4/ac_check_icu.m4
===================================================================
--- trunk/php/luasandbox/m4/ac_check_icu.m4 (rev 0)
+++ trunk/php/luasandbox/m4/ac_check_icu.m4 2012-04-02 08:53:16 UTC (rev
114650)
@@ -0,0 +1,62 @@
+dnl @synopsis AC_CHECK_ICU(version, action-if, action-if-not)
+dnl
+dnl @summary check for ICU of sufficient version by looking at icu-config
+dnl
+dnl Defines ICU_LIBS, ICU_CFLAGS, ICU_CXXFLAGS. See icu-config(1) man
+dnl page.
+dnl
+dnl @category InstalledPackages
+dnl @author Akos Maroy <[email protected]>
+dnl @version 2005-09-20
+dnl @license AllPermissive
+
+AC_DEFUN([AC_CHECK_ICU], [
+  succeeded=no
+
+  if test -z "$ICU_CONFIG"; then
+    AC_PATH_PROG(ICU_CONFIG, icu-config, no)
+  fi
+
+  if test "$ICU_CONFIG" = "no" ; then
+    echo "*** The icu-config script could not be found. Make sure it is"
+    echo "*** in your path, and that ICU is properly installed."
+    echo "*** Or see http://ibm.com/software/globalization/icu/"
+  else
+    ICU_VERSION=`$ICU_CONFIG --version`
+    AC_MSG_CHECKING(for ICU >= $1)
+    ## NOTE: expr compares operands that are not plain integers as strings,
+    ## so this check is lexicographic rather than numeric.
+    VERSION_CHECK=`expr $ICU_VERSION \>\= $1`
+    if test "$VERSION_CHECK" = "1" ; then
+      AC_MSG_RESULT(yes)
+      succeeded=yes
+
+      AC_MSG_CHECKING(ICU_CFLAGS)
+      ICU_CFLAGS=`$ICU_CONFIG --cflags`
+      AC_MSG_RESULT($ICU_CFLAGS)
+
+      AC_MSG_CHECKING(ICU_CXXFLAGS)
+      ICU_CXXFLAGS=`$ICU_CONFIG --cxxflags`
+      AC_MSG_RESULT($ICU_CXXFLAGS)
+
+      AC_MSG_CHECKING(ICU_LIBS)
+      ICU_LIBS=`$ICU_CONFIG --ldflags`
+      AC_MSG_RESULT($ICU_LIBS)
+    else
+      ICU_CFLAGS=""
+      ICU_CXXFLAGS=""
+      ICU_LIBS=""
+      ## If we have a custom action on failure, don't print errors, but
+      ## do set a variable so people can do so.
+      ifelse([$3], ,echo "can't find ICU >= $1",)
+    fi
+
+    AC_SUBST(ICU_CFLAGS)
+    AC_SUBST(ICU_CXXFLAGS)
+    AC_SUBST(ICU_LIBS)
+  fi
+
+  if test "$succeeded" = yes; then
+     ifelse([$2], , :, [$2])
+  else
+     ifelse([$3], , AC_MSG_ERROR([Library requirements (ICU) not met.]), [$3])
+  fi
+])
Added: trunk/php/luasandbox/ustring.c
===================================================================
--- trunk/php/luasandbox/ustring.c (rev 0)
+++ trunk/php/luasandbox/ustring.c 2012-04-02 08:53:16 UTC (rev 114650)
@@ -0,0 +1,1038 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <lua.h>
+#include <lauxlib.h>
+
+#include <unicode/utf.h>
+#include <unicode/uchar.h>
+#include <unicode/ustring.h>
+
+#include "php.h"
+#include "php_luasandbox.h"
+#include "luasandbox_unicode.h"
+
+/**
+ * If errorCode denotes an ICU failure, pushes a descriptive message, runs
+ * cleanupCode and raises a Lua error. cleanupCode runs before lua_error()
+ * because lua_error() does not return to the caller. Otherwise errorCode is
+ * reset to U_ZERO_ERROR, which also discards ICU warnings.
+ *
+ * Expects a lua_State pointer named L in the enclosing scope.
+ */
+#define LUASANDBOX_CHECK_ICU_ERROR(errorCode, cleanupCode) do { \
+    if( U_FAILURE(errorCode) ) { \
+        char _luasandbox_errmsg[1024]; \
+        snprintf( _luasandbox_errmsg, sizeof(_luasandbox_errmsg), "Unicode handling error: %s", u_errorName(errorCode) ); \
+        lua_pushstring( L, _luasandbox_errmsg ); \
+        cleanupCode; \
+        lua_error(L); \
+    } \
+    errorCode = U_ZERO_ERROR; \
+} while(0)
+
+/****************** Prototypes ******************/
+
+// Constructor (installed as the global function "u") and metamethod handlers
+int luasandbox_ustr_create(lua_State * L);
+int luasandbox_ustr_len(lua_State * L);
+int luasandbox_ustr_concat(lua_State * L);
+int luasandbox_ustr_eq(lua_State * L);
+int luasandbox_ustr_index(lua_State * L);
+
+// Functions exported to Lua through the "ustring" library table
+int luasandbox_ustr_ucfirst(lua_State * L);
+int luasandbox_ustr_uc(lua_State * L);
+int luasandbox_ustr_lc(lua_State * L);
+int luasandbox_ustr_tc(lua_State * L);
+int luasandbox_ustr_trim(lua_State * L);
+int luasandbox_ustr_sub(lua_State * L);
+int luasandbox_ustr_pos(lua_State * L);
+int luasandbox_ustr_replace(lua_State * L);
+int luasandbox_ustr_split(lua_State * L);
+
+/****************** Registration of functions ******************/
+
+/** Functions registered in the "ustring" library table. */
+static const luaL_Reg luasandbox_ustr_functions[] = {
+    { "len", luasandbox_ustr_len },
+    { "ucfirst", luasandbox_ustr_ucfirst },
+    { "uc", luasandbox_ustr_uc },
+    { "lc", luasandbox_ustr_lc },
+    { "tc", luasandbox_ustr_tc },
+    { "trim", luasandbox_ustr_trim },
+    { "sub", luasandbox_ustr_sub },
+    { "pos", luasandbox_ustr_pos },
+    { "replace", luasandbox_ustr_replace },
+    { "split", luasandbox_ustr_split },
+    { NULL, NULL } // Sentinel entry terminating the list for luaL_register
+};
+
+/** {{{ luasandbox_install_unicode_functions
+ *
+ * Installs the unicode module into the global namespace: creates the
+ * "luasandbox_ustr" metatable with its metamethods, the global constructor
+ * function "u" and the "ustring" library table.
+ *
+ * Leaves the Lua stack exactly as it found it.
+ */
+void luasandbox_install_unicode_functions(lua_State * L)
+{
+    luaL_newmetatable( L, "luasandbox_ustr" );
+
+    lua_pushstring( L, "__len" );
+    lua_pushcfunction( L, luasandbox_ustr_len );
+    lua_rawset( L, -3 );
+
+    lua_pushstring( L, "__concat" );
+    lua_pushcfunction( L, luasandbox_ustr_concat );
+    lua_rawset( L, -3 );
+
+    lua_pushstring( L, "__eq" );
+    lua_pushcfunction( L, luasandbox_ustr_eq );
+    lua_rawset( L, -3 );
+
+    lua_pushstring( L, "__index" );
+    lua_pushcfunction( L, luasandbox_ustr_index );
+    lua_rawset( L, -3 );
+
+    // The metatable stays reachable through the registry; drop our reference
+    lua_pop( L, 1 );
+
+    lua_pushcfunction( L, luasandbox_ustr_create );
+    lua_setglobal( L, "u" );
+
+    // luaL_register leaves the library table on the stack; remove it
+    luaL_register( L, "ustring", luasandbox_ustr_functions );
+    lua_pop( L, 1 );
+}
+/* }}} */
+
+/****************** Common functions ******************/
+
+/** {{{ luasandbox_init_ustr
+ *
+ * Allocates a ustring userdata with room for len bytes of UTF-8 data, pushes
+ * it on the Lua stack and assigns the "luasandbox_ustr" metatable to it.
+ *
+ * raw_len is set to len and cp_len to 0; the caller is expected to fill in
+ * the data and the real cp_len. Returns the pointer to the new header.
+ */
+luasandbox_ustr_header *luasandbox_init_ustr(lua_State * L, size_t len)
+{
+    luasandbox_ustr_header *result;
+
+    result = (luasandbox_ustr_header*) lua_newuserdata( L, sizeof(luasandbox_ustr_header) + len );
+    result->raw_len = len;
+    // Never leave the code point count indeterminate
+    result->cp_len = 0;
+
+    luaL_getmetatable( L, "luasandbox_ustr" );
+    lua_setmetatable( L, -2 );
+
+    return result;
+}
+/* }}} */
+
+/** {{{ luasandbox_push_ustr
+ *
+ * Constructs the ustring object from a UTF-8 string and pushes it on the
+ * stack. Validates the string and raises a Lua error if it is not valid
+ * UTF-8 or if it is too long to be addressed with 32-bit offsets.
+ *
+ * The bytes are copied, so the caller keeps ownership of str.
+ */
+luasandbox_ustr_header *luasandbox_push_ustr(lua_State * L, uint8_t *str, size_t len)
+{
+    luasandbox_ustr_header *header;
+    int32_t i, raw_len, cp_len;
+
+    // The U8_* macros and cp_len work with signed 32-bit offsets
+    if( len > INT32_MAX ) {
+        lua_pushstring( L, "String is too long" );
+        lua_error( L );
+    }
+    raw_len = (int32_t) len;
+
+    // Validate the string + calculate length
+    for( i = cp_len = 0; i < raw_len; cp_len++ ) {
+        UChar32 cur;
+
+        // U8_NEXT yields a negative code point for an ill-formed sequence
+        U8_NEXT( str, i, raw_len, cur );
+        if( cur < 0 ) {
+            lua_pushstring( L, "Invalid UTF-8 supplied" );
+            lua_error( L );
+        }
+    }
+
+    header = luasandbox_init_ustr( L, len );
+    header->cp_len = cp_len;
+    memcpy( LUASANDBOX_USTR_RAW(header), str, len );
+
+    return header;
+}
+/* }}} */
+
+/** {{{ luasandbox_isustr
+ *
+ * Checks if the object at the given stack index is a ustring, i.e. a
+ * userdata whose metatable is the "luasandbox_ustr" metatable.
+ * Leaves the stack unchanged.
+ */
+int luasandbox_isustr(lua_State * L, int idx)
+{
+    int result;
+
+    if( lua_type( L, idx ) != LUA_TUSERDATA )
+        return FALSE;
+
+    if( !lua_getmetatable( L, idx ) )
+        return FALSE;
+
+    luaL_getmetatable( L, "luasandbox_ustr" );
+
+    // Identity comparison: must not be influenced by any __eq metamethod
+    result = lua_rawequal( L, -1, -2 );
+    lua_pop( L, 2 );
+    return result;
+}
+/* }}} */
+
+/** {{{ luasandbox_checkustring
+ *
+ * Checks whether the specified object on the stack is a ustring
+ * or an object which may be converted to it (a Lua string or number).
+ * A convertible value is replaced in its stack slot by the new ustring.
+ * Returns the pointer to the ustring's header; raises a Lua error if the
+ * value is neither a ustring nor convertible.
+ */
+luasandbox_ustr_header* luasandbox_checkustring(lua_State * L, int idx)
+{
+    int type = lua_type( L, idx );
+
+    if ( type == LUA_TSTRING || type == LUA_TNUMBER ) {
+        // The pushes below shift relative indexes, so make idx absolute
+        // (pseudo-indexes are left alone)
+        if( idx < 0 && idx > LUA_REGISTRYINDEX ) {
+            idx = lua_gettop( L ) + idx + 1;
+        }
+
+        // A usual string. Magically convert it to ustring.
+        luaL_checkstack( L, 2, "not enough stack space to create a ustring" );
+        lua_pushvalue( L, idx );
+        luasandbox_ustr_create(L);
+        lua_replace( L, idx );
+        lua_pop( L, 1 );
+    }
+
+    return luaL_checkudata( L, idx, "luasandbox_ustr" );
+}
+/* }}} */
+
+/** {{{ luasandbox_getustr
+ *
+ * Fetches the ustring at the given stack index through
+ * luasandbox_checkustring(), stores its length in bytes into *raw_len
+ * and returns the pointer to its UTF-8 data.
+ */
+const uint8_t* luasandbox_getustr(lua_State * L, int idx, size_t* raw_len)
+{
+    luasandbox_ustr_header *ustr = luasandbox_checkustring( L, idx );
+
+    *raw_len = ustr->raw_len;
+    return LUASANDBOX_USTR_RAW(ustr);
+}
+/* }}} */
+
+/** {{{ luasandbox_ustr_index_to_offset
+ *
+ * Converts a Lua index (starting with 1) to a C offset (starting with 0),
+ * both counted in code points.
+ * Handles negative indexes as indexes numbered from the end of the string.
+ *
+ * Index 0 always raises a Lua error. When check_limits is non-zero, an index
+ * whose magnitude exceeds the code point length raises a Lua error as well.
+ */
+int32_t luasandbox_ustr_index_to_offset(lua_State * L, luasandbox_ustr_header *str, int32_t idx, int check_limits)
+{
+    // "idx < -cp_len" instead of "-idx > cp_len": negating INT32_MIN overflows
+    if( idx == 0 || ( check_limits && ( idx > str->cp_len || idx < -str->cp_len ) ) ) {
+        lua_pushfstring( L, "Trying to access invalid index %d for string with length %d", (int) idx, (int) str->cp_len );
+        lua_error( L );
+    }
+
+    if( idx > 0 ) {
+        return idx - 1;
+    } else {
+        return str->cp_len + idx;
+    }
+}
+/* }}} */
+
+/****************** Conversions ******************/
+
+/** {{{ luasandbox_convert_toUTF16
+ *
+ * Takes the ustring at stack index idx, converts it to UTF-16 and pushes
+ * the UTF-16 data on the top of the stack as a plain Lua string.
+ * Raises a Lua error if ICU reports a conversion failure.
+ */
+void luasandbox_convert_toUTF16(lua_State * L, int idx)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    int32_t unit_count;
+    luasandbox_ustr_header *source;
+    UChar *buffer;
+
+    source = luasandbox_checkustring( L, idx );
+
+    // Buffer of raw_len UTF-16 code units, two bytes each
+    buffer = emalloc( source->raw_len * 2 );
+    u_strFromUTF8( buffer, source->raw_len, &unit_count,
+        LUASANDBOX_USTR_RAW(source), source->raw_len, &status );
+    LUASANDBOX_CHECK_ICU_ERROR( status, efree( buffer ) );
+
+    lua_pushlstring( L, (char*)buffer, unit_count * 2 );
+    efree( buffer );
+}
+/* }}} */
+
+/** {{{ luasandbox_convert_fromUTF16
+ *
+ * Converts the UTF-16 data held in the Lua string at stack index idx to
+ * UTF-8, and pushes the resulting ustring on the top of the stack.
+ * Raises a Lua error if the value is not a string or if ICU reports a
+ * conversion failure.
+ */
+void luasandbox_convert_fromUTF16(lua_State * L, int idx)
+{
+    uint8_t *utf8_string;
+    const UChar *utf16_string;
+    size_t orig_len, utf16_len, capacity;
+    int32_t result_len;
+    UErrorCode error_code = U_ZERO_ERROR;
+
+    utf16_string = (const UChar*) luaL_checklstring( L, idx, &orig_len );
+    utf16_len = orig_len / sizeof(UChar);
+
+    // One UTF-16 code unit needs up to three bytes in UTF-8 (a surrogate
+    // pair needs four bytes for its two units), so the UTF-16 byte length
+    // is not a sufficient capacity.
+    capacity = utf16_len * 3;
+    utf8_string = emalloc( capacity );
+    u_strToUTF8( (char*) utf8_string, capacity, &result_len,
+        utf16_string, utf16_len, &error_code );
+    LUASANDBOX_CHECK_ICU_ERROR( error_code, efree( utf8_string ) );
+
+    luasandbox_push_ustr( L, utf8_string, result_len );
+    efree( utf8_string );
+}
+/* }}} */
+
+/****************** Operators ******************/
+
+/** {{{ luasandbox_ustr_create
+ *
+ * Builds a ustring from the string on the top of the stack and pushes it.
+ * Always reports one result.
+ */
+int luasandbox_ustr_create(lua_State * L)
+{
+    size_t byte_count = 0;
+    uint8_t *bytes = (uint8_t*) luaL_checklstring( L, -1, &byte_count );
+
+    luasandbox_push_ustr( L, bytes, byte_count );
+    return 1;
+}
+/* }}} */
+
+/** {{{ luasandbox_ustr_len
+ *
+ * Lua function returning the length of a ustring in code points.
+ * The first argument must be a ustring userdata.
+ */
+int luasandbox_ustr_len(lua_State * L)
+{
+    luasandbox_ustr_header *ustr = luaL_checkudata( L, 1, "luasandbox_ustr" );
+
+    lua_pushinteger( L, ustr->cp_len );
+    return 1;
+}
+/* }}} */
+
+/** {{{ luasandbox_ustr_concat
+ *
+ * Lua function handling the concatenation operator. Both operands are taken
+ * through luasandbox_checkustring(), so plain strings and numbers are
+ * converted. Pushes a new ustring holding the bytes of the first operand
+ * followed by the bytes of the second one.
+ */
+int luasandbox_ustr_concat(lua_State * L)
+{
+    luasandbox_ustr_header *s1, *s2, *newhdr;
+    size_t new_len;
+    uint8_t *newstr;
+
+    s1 = luasandbox_checkustring( L, 1 );
+    s2 = luasandbox_checkustring( L, 2 );
+
+    // Byte lengths are size_t; keep the sum in the same type
+    new_len = s1->raw_len + s2->raw_len;
+    newhdr = luasandbox_init_ustr( L, new_len );
+    newhdr->cp_len = s1->cp_len + s2->cp_len;
+    newstr = LUASANDBOX_USTR_RAW(newhdr);
+    memcpy( newstr, LUASANDBOX_USTR_RAW(s1), s1->raw_len );
+    memcpy( newstr + s1->raw_len, LUASANDBOX_USTR_RAW(s2), s2->raw_len );
+
+    return 1;
+}
+/* }}} */
+
+/** {{{ luasandbox_ustr_eq
+ *
+ * Lua function implementing the equality operator: two ustrings are equal
+ * when their code point counts, byte lengths and bytes all match.
+ */
+int luasandbox_ustr_eq(lua_State * L)
+{
+    luasandbox_ustr_header *lhs = luasandbox_checkustring( L, 1 );
+    luasandbox_ustr_header *rhs = luasandbox_checkustring( L, 2 );
+    int same;
+
+    // Compare the cheap length fields first, the bytes only if they agree
+    same = lhs->cp_len == rhs->cp_len
+        && lhs->raw_len == rhs->raw_len
+        && memcmp( LUASANDBOX_USTR_RAW(lhs), LUASANDBOX_USTR_RAW(rhs), lhs->raw_len ) == 0;
+
+    lua_pushboolean( L, same );
+    return 1;
+}
+/* }}} */
+
+/** {{{ luasandbox_ustr_index
+ *
+ * Lua function implementing the index operator.
+ * A numeric key selects the code point at that position and returns it as
+ * a plain Lua string; any other key is looked up in the global "ustring"
+ * table, which gives access to the member functions.
+ */
+int luasandbox_ustr_index(lua_State * L)
+{
+    luasandbox_ustr_header *self;
+    uint8_t *bytes;
+    int32_t byte_pos, skipped, wanted;
+    UChar32 cp, discarded;
+
+    self = luaL_checkudata( L, 1, "luasandbox_ustr" );
+    bytes = LUASANDBOX_USTR_RAW(self);
+
+    if( lua_type( L, 2 ) != LUA_TNUMBER ) {
+        // Not a position: resolve the key as a member function
+        lua_getglobal( L, "ustring" );
+        lua_pushvalue( L, 2 );
+        lua_gettable( L, -2 );
+        return 1;
+    }
+
+    wanted = luasandbox_ustr_index_to_offset( L, self, lua_tointeger( L, 2 ), TRUE );
+
+    // Step over the code points preceding the requested one
+    byte_pos = 0;
+    for( skipped = 0; skipped < wanted; skipped++ ) {
+        U8_NEXT_UNSAFE( bytes, byte_pos, discarded );
+    }
+
+    // Decode the requested code point to learn how many bytes it occupies
+    U8_GET_UNSAFE( bytes, byte_pos, cp );
+    lua_pushlstring( L, (const char*)( bytes + byte_pos ), U8_LENGTH( cp ) );
+    return 1;
+}
+/* }}} */
+
+/****************** Library ******************/
+
+/** {{{ luasandbox_ustr_ucfirst
+ *
+ * Lua function:
+ *  ustring ucfirst( ustring str )
+ * Converts the first code point of str to upper case using u_toupper().
+ * An empty ustring is returned unchanged. The result is always a ustring.
+ */
+int luasandbox_ustr_ucfirst(lua_State * L)
+{
+    luasandbox_ustr_header *header, *newstr;
+    uint8_t *utf_string, *result;
+    size_t raw_len, oldlen, newlen;
+    UChar32 first, newfirst;
+    int32_t offset = 0;
+
+    header = luaL_checkudata( L, 1, "luasandbox_ustr" );
+    utf_string = LUASANDBOX_USTR_RAW( header );
+    raw_len = header->raw_len;
+
+    if( !raw_len ) {
+        // Nothing to change; hand back the argument so the result stays a ustring
+        lua_pushvalue( L, 1 );
+        return 1;
+    }
+
+    U8_GET_UNSAFE( utf_string, 0, first );
+    newfirst = u_toupper( first );
+
+    // The upper-case form may occupy a different number of bytes
+    oldlen = U8_LENGTH(first);
+    newlen = U8_LENGTH(newfirst);
+
+    // One code point replaces one code point, so cp_len is unchanged
+    newstr = luasandbox_init_ustr( L, raw_len - oldlen + newlen );
+    newstr->cp_len = header->cp_len;
+    result = LUASANDBOX_USTR_RAW(newstr);
+
+    U8_APPEND_UNSAFE( result, offset, newfirst );
+    memcpy( result + newlen, utf_string + oldlen, raw_len - oldlen );
+
+    return 1;
+}
+/* }}} */
+
+#define LUASANDBOX_UTF8_CHANGE_CASE_TOUPPER 1
+#define LUASANDBOX_UTF8_CHANGE_CASE_TOLOWER 2
+#define LUASANDBOX_UTF8_CHANGE_CASE_TOTITLE 3
+
+/** {{{ luasandbox_ustr_apply_case
+ *
+ * Runs the ICU case mapping selected by action on a UTF-16 string, using
+ * the root locale. Returns the length reported by ICU; an unknown action
+ * sets *status to U_ILLEGAL_ARGUMENT_ERROR.
+ */
+static int32_t luasandbox_ustr_apply_case(int action, UChar *dest, int32_t capacity,
+    const UChar *src, int32_t src_len, UErrorCode *status)
+{
+    switch( action ) {
+        case LUASANDBOX_UTF8_CHANGE_CASE_TOUPPER:
+            return u_strToUpper( dest, capacity, src, src_len, "", status );
+        case LUASANDBOX_UTF8_CHANGE_CASE_TOLOWER:
+            return u_strToLower( dest, capacity, src, src_len, "", status );
+        case LUASANDBOX_UTF8_CHANGE_CASE_TOTITLE:
+            return u_strToTitle( dest, capacity, src, src_len, NULL, "", status );
+        default:
+            *status = U_ILLEGAL_ARGUMENT_ERROR;
+            return 0;
+    }
+}
+/* }}} */
+
+/** {{{ luasandbox_ustr_change_case
+ *
+ * Backend function for uc(), lc() and tc(). Converts string into UTF-16,
+ * passes it to ICU function and then converts back to UTF-8. This is required
+ * since casing algorithms are rather non-trivial and may be even
+ * locale-dependent.
+ *
+ * Pushes the resulting ustring and returns 1 (the number of Lua results).
+ * Raises a Lua error if ICU reports a failure.
+ */
+static int luasandbox_ustr_change_case(lua_State * L, int action)
+{
+    const UChar *utf16_orig;
+    UChar *utf16_result;
+    size_t orig_bytes;
+    int32_t orig_len, capacity, result_len;
+    UErrorCode errorCode = U_ZERO_ERROR;
+
+    luasandbox_convert_toUTF16( L, 1 );
+    utf16_orig = (const UChar*) lua_tolstring( L, -1, &orig_bytes );
+    orig_len = (int32_t)( orig_bytes / sizeof(UChar) );
+
+    // First guess: twice the source length in code units
+    capacity = orig_len * 2;
+    utf16_result = emalloc( capacity * sizeof(UChar) );
+    result_len = luasandbox_ustr_apply_case( action, utf16_result, capacity,
+        utf16_orig, orig_len, &errorCode );
+
+    if( errorCode == U_BUFFER_OVERFLOW_ERROR ) {
+        // ICU reported the length it needs; retry once with exactly that much
+        efree( utf16_result );
+        capacity = result_len;
+        utf16_result = emalloc( capacity * sizeof(UChar) );
+        errorCode = U_ZERO_ERROR;
+        result_len = luasandbox_ustr_apply_case( action, utf16_result, capacity,
+            utf16_orig, orig_len, &errorCode );
+    }
+    LUASANDBOX_CHECK_ICU_ERROR( errorCode, efree(utf16_result) );
+    lua_pop( L, 1 ); // Pop UTF-16 string out of the stack
+
+    // Back to UTF-8. Lua copies the bytes, so the buffer can be released
+    // before the conversion, which may raise an error.
+    lua_pushlstring( L, (const char*) utf16_result, result_len * sizeof(UChar) );
+    efree( utf16_result );
+    luasandbox_convert_fromUTF16( L, -1 );
+    lua_replace( L, -2 );
+
+    return 1;
+}
+/* }}} */
+
+/**
+ * Lua function:
+ *  ustring uc( ustring str )
+ * Converts str to upper case; returns the number of Lua results.
+ */
+int luasandbox_ustr_uc(lua_State * L)
+{
+    return luasandbox_ustr_change_case( L, LUASANDBOX_UTF8_CHANGE_CASE_TOUPPER );
+}
+
+/**
+ * Lua function:
+ *  ustring lc( ustring str )
+ * Converts str to lower case; returns the number of Lua results.
+ */
+int luasandbox_ustr_lc(lua_State * L)
+{
+    return luasandbox_ustr_change_case( L, LUASANDBOX_UTF8_CHANGE_CASE_TOLOWER );
+}
+
+/**
+ * Lua function:
+ *  ustring tc( ustring str )
+ * Converts str to title case; returns the number of Lua results.
+ */
+int luasandbox_ustr_tc(lua_State * L)
+{
+    return luasandbox_ustr_change_case( L, LUASANDBOX_UTF8_CHANGE_CASE_TOTITLE );
+}
+
+/** {{{ luasandbox_ustr_is_space
+ *
+ * Tells whether a code point counts as whitespace for trim(): either ICU
+ * predicate accepting it is enough.
+ */
+static int luasandbox_ustr_is_space(UChar32 cp)
+{
+    return u_isWhitespace( cp ) || u_isUWhiteSpace( cp );
+}
+/* }}} */
+
+/** {{{ luasandbox_ustr_trim
+ *
+ * Lua function:
+ *  ustring trim( ustring str )
+ * Returns a copy of str without the whitespace at its beginning and end.
+ */
+int luasandbox_ustr_trim(lua_State * L)
+{
+    luasandbox_ustr_header *source, *trimmed;
+    uint8_t *bytes;
+    size_t kept_len;
+    UChar32 cp;
+    uint32_t pos = 0;
+    uint32_t lead_bytes = 0, lead_cps = 0;
+    uint32_t trail_bytes = 0, trail_cps = 0;
+
+    source = luasandbox_checkustring( L, 1 );
+    bytes = LUASANDBOX_USTR_RAW(source);
+
+    // Leading run: stops right after the first non-whitespace code point
+    while( pos < source->raw_len ) {
+        U8_NEXT_UNSAFE( bytes, pos, cp );
+        if( !luasandbox_ustr_is_space( cp ) ) {
+            break;
+        }
+        lead_bytes = pos;
+        lead_cps++;
+    }
+
+    // Trailing run: measure the current whitespace run, starting over
+    // whenever a non-whitespace code point shows up
+    while( pos < source->raw_len ) {
+        U8_NEXT_UNSAFE( bytes, pos, cp );
+        if( luasandbox_ustr_is_space( cp ) ) {
+            trail_bytes += U8_LENGTH( cp );
+            trail_cps++;
+        } else {
+            trail_bytes = 0;
+            trail_cps = 0;
+        }
+    }
+
+    kept_len = source->raw_len - lead_bytes - trail_bytes;
+    trimmed = luasandbox_init_ustr( L, kept_len );
+    trimmed->cp_len = source->cp_len - lead_cps - trail_cps;
+    memcpy( LUASANDBOX_USTR_RAW(trimmed), bytes + lead_bytes, kept_len );
+
+    return 1;
+}
+/* }}} */
+
+/** {{{ luasandbox_ustr_sub
+ *
+ * Lua function:
+ *  ustring sub( ustring str, int offset[, int length] )
+ * Returns the part of str that begins at the code point position offset
+ * and spans at most length code points. Without a numeric length the
+ * substring runs to the end of str.
+ */
+int luasandbox_ustr_sub(lua_State * L)
+{
+    luasandbox_ustr_header *source;
+    uint8_t *bytes;
+    int32_t pos = 0, seen, first_cp, max_cps;
+    int32_t begin, end;
+    UChar32 cp;
+
+    source = luasandbox_checkustring( L, 1 );
+    bytes = LUASANDBOX_USTR_RAW(source);
+    first_cp = luaL_checkinteger( L, 2 );
+    // -1 never equals a code point count, i.e. it means "no limit"
+    max_cps = ( lua_type( L, 3 ) == LUA_TNUMBER ) ? lua_tointeger( L, 3 ) : -1;
+
+    first_cp = luasandbox_ustr_index_to_offset( L, source, first_cp, TRUE );
+
+    // Walk to the byte position of the first requested code point
+    for( seen = 0; pos < source->raw_len && seen != first_cp; seen++ ) {
+        U8_NEXT_UNSAFE( bytes, pos, cp );
+    }
+
+    // Ran off the end before reaching the start position
+    if( !( pos < source->raw_len ) ) {
+        lua_pushstring( L, "" );
+        return 1;
+    }
+    begin = pos;
+
+    // Walk over at most max_cps code points
+    for( seen = 0; pos < source->raw_len && seen != max_cps; seen++ ) {
+        U8_NEXT_UNSAFE( bytes, pos, cp );
+    }
+    end = ( pos < source->raw_len ) ? pos : source->raw_len;
+
+    luasandbox_push_ustr( L, bytes + begin, end - begin );
+    return 1;
+}
+/* }}} */
+
+/****************** Substring search and related operators. Beware.
******************/
+
+/**
+ * Needle string preprocessed by luasandbox_ustr_search_prepare() and
+ * released with luasandbox_ustr_search_free().
+ */
+typedef struct {
+ UChar32* string; // UTF-32 representation of the needle string
+ int32_t* table; // KMP table (left NULL when singleCharMode is set)
+ int32_t length; // Length of the needle string in code points
+ int32_t raw_length; // Length of the needle string in UTF-8 bytes
+ int singleCharMode; // Whether the needle string is a single character
+} ustr_needle_string;
+
+// Values of ustr_search_result.status
+#define UTF8_SEARCH_STATUS_FOUND 1
+#define UTF8_SEARCH_STATUS_NOTFOUND 0
+
+/**
+ * Outcome of luasandbox_ustr_search(). Both indexes are -1 when nothing
+ * was found.
+ */
+typedef struct {
+ int32_t status; // Status of the search
+ int32_t raw_index; // Index in bytes
+ int32_t cp_index; // Index in codepoints
+} ustr_search_result;
+
+/** {{{ luasandbox_ustr_search_prepare
+ *
+ * Preprocesses a UTF-8 needle so that it can be searched for with the KMP
+ * algorithm: decodes it into code points and, unless it is a single code
+ * point, builds the prefix table. The result must be released with
+ * luasandbox_ustr_search_free().
+ */
+static ustr_needle_string* luasandbox_ustr_search_prepare(uint8_t* utf_string, int32_t raw_len)
+{
+    ustr_needle_string *needle;
+    int32_t byte_pos = 0, cp_count = 0;
+    int32_t fill, candidate = 0;
+    UChar32 cp;
+
+    needle = emalloc( sizeof( ustr_needle_string ) );
+    memset( needle, 0, sizeof( ustr_needle_string ) );
+    // Worst case: every byte is a code point of its own
+    needle->string = emalloc( raw_len * 4 );
+    needle->raw_length = raw_len;
+
+    // Decode UTF-8 into UTF-32 for search purposes
+    while( byte_pos < raw_len ) {
+        U8_NEXT_UNSAFE( utf_string, byte_pos, cp );
+        needle->string[cp_count] = cp;
+        cp_count++;
+    }
+    needle->length = cp_count;
+
+    // Single code points are handled by a dedicated search path and need
+    // no prefix table
+    needle->singleCharMode = ( needle->length == 1 );
+    if( needle->singleCharMode ) {
+        return needle;
+    }
+
+    // Fill the search prefix table
+    needle->table = emalloc( needle->length * sizeof(int32_t) );
+    needle->table[0] = -1;
+    needle->table[1] = 0;
+    fill = 2;
+    while( fill < needle->length ) {
+        if( needle->string[fill - 1] == needle->string[candidate] ) {
+            // The current prefix candidate grows by one
+            candidate++;
+            needle->table[fill] = candidate;
+            fill++;
+        } else if( candidate > 0 ) {
+            // Fall back to a shorter candidate and retry the same slot
+            candidate = needle->table[candidate];
+        } else {
+            needle->table[fill] = 0;
+            fill++;
+        }
+    }
+
+    return needle;
+}
+/* }}} */
+
+/** {{{ luasandbox_ustr_search_free
+ *
+ * Releases a preprocessed needle: its prefix table if there is one, its
+ * code point array and the structure itself.
+ */
+void luasandbox_ustr_search_free(ustr_needle_string *needle)
+{
+    if( needle->table != NULL ) {
+        efree( needle->table );
+    }
+    efree( needle->string );
+    efree( needle );
+}
+/* }}} */
+
+#define UTF8_SEARCH_OFFSET_NONE 0
+#define UTF8_SEARCH_OFFSET_RAW 1
+#define UTF8_SEARCH_OFFSET_CP 2
+
+/** {{{ luasandbox_ustr_search
+ *
+ * Searches for a preprocessed needle in a UTF-8 haystack, using a plain
+ * scan for a single code point needle and the Knuth-Morris-Pratt algorithm
+ * otherwise.
+ *
+ * offset_type selects how offset is interpreted: UTF8_SEARCH_OFFSET_RAW
+ * starts scanning at that byte offset, UTF8_SEARCH_OFFSET_CP ignores the
+ * code points located before that code point offset, any other value
+ * scans from the beginning. Code point indexes are counted from the
+ * position where scanning starts.
+ */
+ustr_search_result luasandbox_ustr_search(uint8_t *haystack, int32_t haystack_len, int offset_type, int offset, ustr_needle_string* needle) {
+    int byte_pos;     // Raw offset in haystack
+    int cp_pos = 0;   // CP offset in haystack
+    int matched = 0;  // CP offset in needle
+    int skip_cps;     // Code points to pass over before matching
+    UChar32 cp;
+    ustr_search_result result;
+
+    // Defaults, returned as is when nothing is found
+    result.status = UTF8_SEARCH_STATUS_NOTFOUND;
+    result.raw_index = -1;
+    result.cp_index = -1;
+
+    byte_pos = ( offset_type == UTF8_SEARCH_OFFSET_RAW ) ? offset : 0;
+    skip_cps = ( offset_type == UTF8_SEARCH_OFFSET_CP ) ? offset : 0;
+
+    while( byte_pos < haystack_len ) {
+        U8_NEXT_UNSAFE( haystack, byte_pos, cp );
+
+        if( cp_pos >= skip_cps ) {
+            if( needle->singleCharMode ) {
+                // Special case of a single character
+                if( needle->string[0] == cp ) {
+                    result.status = UTF8_SEARCH_STATUS_FOUND;
+                    result.cp_index = cp_pos;
+                    result.raw_index = byte_pos - needle->raw_length;
+                    return result;
+                }
+            } else {
+                // KMP step: shrink the partial match until it can be extended
+                while( matched > 0 && needle->string[matched] != cp ) {
+                    matched = needle->table[matched];
+                }
+                if( needle->string[matched] == cp ) {
+                    matched++;
+                }
+                if( matched == needle->length ) {
+                    result.status = UTF8_SEARCH_STATUS_FOUND;
+                    result.cp_index = (cp_pos + 1) - needle->length;
+                    result.raw_index = byte_pos - needle->raw_length;
+                    return result;
+                }
+            }
+        }
+
+        cp_pos++;
+    }
+
+    return result;
+}
+/* }}} */
+
+/** {{{ luasandbox_ustr_pos
+ *
+ * Lua function
+ *  int pos( ustring haystack, ustring needle[, int offset] )
+ * Searches for a substring in a string, starting at the code point
+ * position offset (1 when omitted). Returns the position of the match
+ * according to Lua conventions (starting with 1), or -1 if there is none.
+ * Raises a Lua error if offset is invalid or needle is empty.
+ */
+int luasandbox_ustr_pos(lua_State * L)
+{
+    luasandbox_ustr_header *header_haystack, *header_needle;
+    uint8_t *haystack, *needle_raw;
+    ustr_needle_string *needle;
+    int32_t offset;
+    ustr_search_result result;
+
+    header_haystack = luasandbox_checkustring( L, 1 );
+    header_needle = luasandbox_checkustring( L, 2 );
+
+    haystack = LUASANDBOX_USTR_RAW(header_haystack);
+    needle_raw = LUASANDBOX_USTR_RAW(header_needle);
+    if( lua_type( L, 3 ) == LUA_TNUMBER ) {
+        offset = lua_tointeger( L, 3 );
+    } else {
+        offset = 1;
+    }
+
+    offset = luasandbox_ustr_index_to_offset( L, header_haystack, offset, TRUE );
+
+    if( !header_needle->raw_len ) {
+        lua_pushstring( L, "The needle parameter may not be empty" );
+        lua_error( L );
+    }
+
+    needle = luasandbox_ustr_search_prepare( needle_raw, header_needle->raw_len );
+
+    result = luasandbox_ustr_search( haystack, header_haystack->raw_len, UTF8_SEARCH_OFFSET_CP, offset, needle );
+    luasandbox_ustr_search_free( needle );
+
+    // Every path pushes exactly one result and returns
+    if( result.status == UTF8_SEARCH_STATUS_FOUND ) {
+        lua_pushinteger( L, result.cp_index + 1 );
+    } else {
+        lua_pushinteger( L, -1 );
+    }
+    return 1;
+}
+/* }}} */
+
+/** {{{ luasandbox_ustr_replace
+ *
+ * Lua function:
+ *  replace( ustring haystack, ustring needle, ustring replacement[, int offset[, int limit]] )
+ * Replaces at most limit occurrences of needle in haystack with replacement,
+ * starting at the code point position offset. A limit that is absent or
+ * not positive means "no limit". Returns the haystack itself when nothing
+ * matches, otherwise a new ustring.
+ * Raises a Lua error if offset is invalid or needle is empty.
+ */
+int luasandbox_ustr_replace(lua_State * L)
+{
+    luasandbox_ustr_header *header_haystack, *header_needle, *header_replacement, *header_result;
+    uint8_t *haystack, *needle_raw, *replacement, *result;
+    size_t haystack_len, needle_len, replacement_len, result_len;
+    size_t offset_src, offset_dest, chunk_len;
+    ustr_needle_string *needle;
+    ustr_search_result cur;
+    int32_t i, offset, matches_num, limit;
+    int32_t *matches;
+    int offset_mode;
+
+    header_haystack = luasandbox_checkustring( L, 1 );
+    header_needle = luasandbox_checkustring( L, 2 );
+    header_replacement = luasandbox_checkustring( L, 3 );
+
+    haystack = LUASANDBOX_USTR_RAW(header_haystack);
+    haystack_len = header_haystack->raw_len;
+    needle_raw = LUASANDBOX_USTR_RAW(header_needle);
+    needle_len = header_needle->raw_len;
+    replacement = LUASANDBOX_USTR_RAW(header_replacement);
+    replacement_len = header_replacement->raw_len;
+
+    if( lua_type( L, 4 ) == LUA_TNUMBER ) {
+        offset = lua_tointeger( L, 4 );
+        offset = luasandbox_ustr_index_to_offset( L, header_haystack, offset, TRUE );
+        offset_mode = UTF8_SEARCH_OFFSET_CP;
+    } else {
+        offset = 0;
+        offset_mode = UTF8_SEARCH_OFFSET_RAW;
+    }
+    limit = ( lua_type( L, 5 ) == LUA_TNUMBER ) ?
+        luaL_checkinteger( L, 5 ) :
+        -1;
+
+    if( !needle_len ) {
+        lua_pushstring( L, "The needle parameter may not be empty" );
+        lua_error( L );
+    }
+
+    needle = luasandbox_ustr_search_prepare( needle_raw, needle_len );
+
+    // Matches do not overlap, so there are at most haystack_len / needle_len
+    matches = emalloc( ( haystack_len / needle_len + 1 ) * sizeof(int32_t) );
+
+    // Find all substrings to replace
+    matches_num = 0;
+    for(;;) {
+        if( limit > 0 && matches_num >= limit ) {
+            break;
+        }
+
+        cur = luasandbox_ustr_search( haystack, haystack_len, offset_mode, offset, needle );
+        if( cur.status != UTF8_SEARCH_STATUS_FOUND ) {
+            break;
+        }
+
+        matches[matches_num] = cur.raw_index;
+        matches_num++;
+        // Continue right after this match, from now on by byte offset
+        offset = cur.raw_index + needle->raw_length;
+        offset_mode = UTF8_SEARCH_OFFSET_RAW;
+    }
+    luasandbox_ustr_search_free( needle );
+
+    if( !matches_num ) {
+        efree( matches );
+        lua_pushvalue( L, 1 );
+        return 1;
+    }
+
+    // Initialize the resulting string. Byte length and code point length
+    // are adjusted separately, each with its own unit.
+    result_len = haystack_len - needle_len * matches_num + replacement_len * matches_num;
+    header_result = luasandbox_init_ustr( L, result_len );
+    header_result->cp_len = header_haystack->cp_len +
+        ( header_replacement->cp_len - header_needle->cp_len ) * matches_num;
+    result = LUASANDBOX_USTR_RAW(header_result);
+
+    // Copy the text preceding each match, then the replacement
+    offset_src = 0;
+    offset_dest = 0;
+    for( i = 0; i < matches_num; i++ ) {
+        chunk_len = (size_t) matches[i] - offset_src;
+        memcpy( result + offset_dest, haystack + offset_src, chunk_len );
+        offset_dest += chunk_len;
+
+        memcpy( result + offset_dest, replacement, replacement_len );
+        offset_dest += replacement_len;
+
+        offset_src = (size_t) matches[i] + needle_len;
+    }
+    // Copy whatever follows the last match
+    memcpy( result + offset_dest, haystack + offset_src, haystack_len - offset_src );
+
+    efree( matches );
+
+    return 1;
+}
+/* }}} */
+
+/** {{{ luasandbox_ustr_split
+ *
+ * Lua function:
+ *  split( ustring haystack, ustring separator[, int limit] )
+ * Splits haystack at the occurrences of separator and returns a table
+ * (indexed from 1) holding the pieces as plain Lua strings. At most limit
+ * separators are honoured; a limit that is absent or not positive means
+ * "no limit". Raises a Lua error if separator is empty.
+ */
+int luasandbox_ustr_split(lua_State * L)
+{
+    luasandbox_ustr_header *header_haystack, *header_needle;
+    uint8_t *haystack, *needle_raw;
+    size_t haystack_len, needle_len, piece_start;
+    ustr_needle_string *needle;
+    ustr_search_result cur;
+    int32_t i, offset, matches_num, limit;
+    int32_t *matches;
+
+    header_haystack = luasandbox_checkustring( L, 1 );
+    header_needle = luasandbox_checkustring( L, 2 );
+
+    haystack = LUASANDBOX_USTR_RAW(header_haystack);
+    needle_raw = LUASANDBOX_USTR_RAW(header_needle);
+    haystack_len = header_haystack->raw_len;
+    needle_len = header_needle->raw_len;
+
+    // Test the type of the argument, not its value
+    limit = ( lua_type( L, 3 ) == LUA_TNUMBER ) ?
+        luaL_checkinteger( L, 3 ) :
+        -1;
+
+    if( !needle_len ) {
+        lua_pushstring( L, "The needle parameter may not be empty" );
+        lua_error( L );
+    }
+
+    needle = luasandbox_ustr_search_prepare( needle_raw, needle_len );
+
+    // Matches do not overlap, so there are at most haystack_len / needle_len
+    matches = emalloc( ( haystack_len / needle_len + 1 ) * sizeof(int32_t) );
+
+    // Find all separators
+    matches_num = 0;
+    offset = 0;
+    for(;;) {
+        if( limit > 0 && matches_num >= limit ) {
+            break;
+        }
+
+        cur = luasandbox_ustr_search( haystack, haystack_len, UTF8_SEARCH_OFFSET_RAW, offset, needle );
+        if( cur.status != UTF8_SEARCH_STATUS_FOUND ) {
+            break;
+        }
+
+        matches[matches_num] = cur.raw_index;
+        matches_num++;
+        offset = cur.raw_index + needle->raw_length;
+    }
+    luasandbox_ustr_search_free( needle );
+
+    lua_createtable( L, matches_num + 1, 0 );
+
+    // Piece i + 1 is the text between the previous separator and match i
+    piece_start = 0;
+    for( i = 0; i < matches_num; i++ ) {
+        lua_pushlstring( L, (const char*)( haystack + piece_start ), (size_t) matches[i] - piece_start );
+        lua_rawseti( L, -2, i + 1 );
+        piece_start = (size_t) matches[i] + needle_len;
+    }
+
+    // The last piece runs to the end; without matches it is the whole haystack
+    lua_pushlstring( L, (const char*)( haystack + piece_start ), haystack_len - piece_start );
+    lua_rawseti( L, -2, matches_num + 1 );
+
+    efree( matches );
+
+    return 1;
+}
+/* }}} */
Property changes on: trunk/php/luasandbox/ustring.c
___________________________________________________________________
Added: svn:eol-style
+ native
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs