Added: subversion/upstream/utf8proc/pgsql/utf8proc.sql URL: http://svn.apache.org/viewvc/subversion/upstream/utf8proc/pgsql/utf8proc.sql?rev=1405742&view=auto ============================================================================== --- subversion/upstream/utf8proc/pgsql/utf8proc.sql (added) +++ subversion/upstream/utf8proc/pgsql/utf8proc.sql Mon Nov 5 10:54:56 2012 @@ -0,0 +1,6 @@ +CREATE OR REPLACE FUNCTION unifold (text) RETURNS text + LANGUAGE 'C' IMMUTABLE STRICT AS '$libdir/utf8proc_pgsql.so', + 'utf8proc_pgsql_unifold'; +CREATE OR REPLACE FUNCTION unistrip (text) RETURNS text + LANGUAGE 'C' IMMUTABLE STRICT AS '$libdir/utf8proc_pgsql.so', + 'utf8proc_pgsql_unistrip';
Added: subversion/upstream/utf8proc/pgsql/utf8proc_pgsql.c URL: http://svn.apache.org/viewvc/subversion/upstream/utf8proc/pgsql/utf8proc_pgsql.c?rev=1405742&view=auto ============================================================================== --- subversion/upstream/utf8proc/pgsql/utf8proc_pgsql.c (added) +++ subversion/upstream/utf8proc/pgsql/utf8proc_pgsql.c Mon Nov 5 10:54:56 2012 @@ -0,0 +1,139 @@ +/* + * Copyright (c) Public Software Group e. V., Berlin, Germany + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + + +/* + * File name: pgsql/utf8proc_pgsql.c + * + * Description: + * PostgreSQL extension to provide two functions 'unifold' and 'unistrip', + * which can be used to case-fold and normalize index fields and + * optionally strip marks (e.g. accents) from strings. + */ + + +#include "../utf8proc.c" + +#include <postgres.h> +#include <utils/elog.h> +#include <fmgr.h> +#include <string.h> +#include <unistd.h> +#include <utils/builtins.h> + +#ifdef PG_MODULE_MAGIC +PG_MODULE_MAGIC; +#endif + +#define UTF8PROC_PGSQL_FOLD_OPTS ( UTF8PROC_REJECTNA | UTF8PROC_COMPAT | \ + UTF8PROC_COMPOSE | UTF8PROC_STABLE | UTF8PROC_IGNORE | UTF8PROC_STRIPCC | \ + UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD | UTF8PROC_LUMP ) +#define UTF8PROC_PGSQL_STRIP_OPTS ( UTF8PROC_REJECTNA | UTF8PROC_COMPAT | \ + UTF8PROC_COMPOSE | UTF8PROC_STABLE | UTF8PROC_IGNORE | UTF8PROC_STRIPCC | \ + UTF8PROC_NLF2LF | UTF8PROC_CASEFOLD | UTF8PROC_LUMP | UTF8PROC_STRIPMARK ) + +ssize_t utf8proc_pgsql_utf8map( + text *input_string, text **output_string_ptr, int options +) { + ssize_t result; + text *output_string; + result = utf8proc_decompose( + VARDATA(input_string), VARSIZE(input_string) - VARHDRSZ, + NULL, 0, options + ); + if (result < 0) return result; + if (result > (SIZE_MAX-1-VARHDRSZ)/sizeof(int32_t)) + return UTF8PROC_ERROR_OVERFLOW; + /* reserve one extra byte for termination */ + *output_string_ptr = palloc(result * sizeof(int32_t) + 1 + VARHDRSZ); + output_string = *output_string_ptr; + if (!output_string) return UTF8PROC_ERROR_NOMEM; + result = utf8proc_decompose( + VARDATA(input_string), VARSIZE(input_string) - VARHDRSZ, + (int32_t *)VARDATA(output_string), result, options + ); + if (result < 0) return result; + result = utf8proc_reencode( + (int32_t *)VARDATA(output_string), result, options + ); + if (result >= 0) SET_VARSIZE(output_string, result + VARHDRSZ); + return result; +} + +void utf8proc_pgsql_utf8map_errchk(ssize_t result, text *output_string) { + if (result < 0) { + int sqlerrcode; + if (output_string) pfree(output_string); + switch(result) { + case UTF8PROC_ERROR_NOMEM: + sqlerrcode = ERRCODE_OUT_OF_MEMORY; break; + case UTF8PROC_ERROR_OVERFLOW: + sqlerrcode = ERRCODE_PROGRAM_LIMIT_EXCEEDED; break; + case UTF8PROC_ERROR_INVALIDUTF8: + case UTF8PROC_ERROR_NOTASSIGNED: + return; + default: + sqlerrcode = ERRCODE_INTERNAL_ERROR; + } + ereport(ERROR, ( + errcode(sqlerrcode), + errmsg("%s", utf8proc_errmsg(result)) + )); + } +} + +PG_FUNCTION_INFO_V1(utf8proc_pgsql_unifold); +Datum utf8proc_pgsql_unifold(PG_FUNCTION_ARGS) { + text *input_string; + text *output_string = NULL; + ssize_t result; + input_string = PG_GETARG_TEXT_P(0); + result = utf8proc_pgsql_utf8map( + input_string, &output_string, UTF8PROC_PGSQL_FOLD_OPTS + ); + PG_FREE_IF_COPY(input_string, 0); + utf8proc_pgsql_utf8map_errchk(result, output_string); + if (result >= 0) { + PG_RETURN_TEXT_P(output_string); + } else { + PG_RETURN_NULL(); + } +} + +PG_FUNCTION_INFO_V1(utf8proc_pgsql_unistrip); +Datum utf8proc_pgsql_unistrip(PG_FUNCTION_ARGS) { + text *input_string; + text *output_string = NULL; + ssize_t result; + input_string = PG_GETARG_TEXT_P(0); + result = utf8proc_pgsql_utf8map( + input_string, &output_string, UTF8PROC_PGSQL_STRIP_OPTS + ); + PG_FREE_IF_COPY(input_string, 0); + utf8proc_pgsql_utf8map_errchk(result, output_string); + if (result >= 0) { + PG_RETURN_TEXT_P(output_string); + } else { + PG_RETURN_NULL(); + } +} + Added: subversion/upstream/utf8proc/ruby/extconf.rb URL: http://svn.apache.org/viewvc/subversion/upstream/utf8proc/ruby/extconf.rb?rev=1405742&view=auto ============================================================================== --- subversion/upstream/utf8proc/ruby/extconf.rb (added) +++ subversion/upstream/utf8proc/ruby/extconf.rb Mon Nov 5 10:54:56 2012 @@ -0,0 +1,2 @@ +require 'mkmf' +create_makefile("utf8proc_native") Added: subversion/upstream/utf8proc/ruby/gem/LICENSE URL: http://svn.apache.org/viewvc/subversion/upstream/utf8proc/ruby/gem/LICENSE?rev=1405742&view=auto ============================================================================== --- subversion/upstream/utf8proc/ruby/gem/LICENSE (added) +++ subversion/upstream/utf8proc/ruby/gem/LICENSE Mon Nov 5 10:54:56 2012 @@ -0,0 +1,64 @@ + +Copyright (c) 2009 Public Software Group e. V., Berlin, Germany + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + + +This software distribution contains derived data from a modified version of +the Unicode data files. The following license applies to that data: + +COPYRIGHT AND PERMISSION NOTICE + +Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed +under the Terms of Use in http://www.unicode.org/copyright.html. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of the Unicode data files and any associated documentation (the "Data +Files") or Unicode software and any associated documentation (the +"Software") to deal in the Data Files or Software without restriction, +including without limitation the rights to use, copy, modify, merge, +publish, distribute, and/or sell copies of the Data Files or Software, and +to permit persons to whom the Data Files or Software are furnished to do +so, provided that (a) the above copyright notice(s) and this permission +notice appear with all copies of the Data Files or Software, (b) both the +above copyright notice(s) and this permission notice appear in associated +documentation, and (c) there is clear notice in each modified Data File or +in the Software as well as in the documentation associated with the Data +File(s) or Software that the data or software has been modified. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF +THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS +INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR +CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF +USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder shall +not be used in advertising or otherwise to promote the sale, use or other +dealings in these Data Files or Software without prior written +authorization of the copyright holder. + + +Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be +registered in some jurisdictions. All other trademarks and registered +trademarks mentioned herein are the property of their respective owners. + Added: subversion/upstream/utf8proc/ruby/gem/utf8proc.gemspec URL: http://svn.apache.org/viewvc/subversion/upstream/utf8proc/ruby/gem/utf8proc.gemspec?rev=1405742&view=auto ============================================================================== --- subversion/upstream/utf8proc/ruby/gem/utf8proc.gemspec (added) +++ subversion/upstream/utf8proc/ruby/gem/utf8proc.gemspec Mon Nov 5 10:54:56 2012 @@ -0,0 +1,12 @@ +require 'rubygems' +SPEC = Gem::Specification.new do |s| + s.name = 'utf8proc' + s.version = '1.1.5' + s.author = 'Public Software Group e. V., Berlin, Germany' + s.homepage = 'http://www.public-software-group.org/utf8proc' + s.summary = 'UTF-8 Unicode string processing' + s.files = ['LICENSE', 'lib/utf8proc.rb', 'ext/utf8proc_native.c'] + s.require_path = 'lib/' + s.extensions = ['ext/extconf.rb'] + s.has_rdoc = false +end Added: subversion/upstream/utf8proc/ruby/utf8proc.rb URL: http://svn.apache.org/viewvc/subversion/upstream/utf8proc/ruby/utf8proc.rb?rev=1405742&view=auto ============================================================================== --- subversion/upstream/utf8proc/ruby/utf8proc.rb (added) +++ subversion/upstream/utf8proc/ruby/utf8proc.rb Mon Nov 5 10:54:56 2012 @@ -0,0 +1,98 @@ +# Copyright (c) 2009 Public Software Group e. V., Berlin, Germany +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# +# File name: ruby/utf8proc.rb +# +# Description: +# Part of the ruby wrapper for libutf8proc, which is written in ruby. +# + + +require 'utf8proc_native' + + +module Utf8Proc + + SpecialChars = { + :HT => "\x09", + :LF => "\x0A", + :VT => "\x0B", + :FF => "\x0C", + :CR => "\x0D", + :FS => "\x1C", + :GS => "\x1D", + :RS => "\x1E", + :US => "\x1F", + :LS => "\xE2\x80\xA8", + :PS => "\xE2\x80\xA9", + } + + module StringExtensions + def utf8map(*option_array) + options = 0 + option_array.each do |option| + flag = Utf8Proc::Options[option] + raise ArgumentError, "Unknown argument given to String#utf8map." unless + flag + options |= flag + end + return Utf8Proc::utf8map(self, options) + end + def utf8map!(*option_array) + self.replace(self.utf8map(*option_array)) + end + def utf8nfd; utf8map( :stable, :decompose); end + def utf8nfd!; utf8map!(:stable, :decompose); end + def utf8nfc; utf8map( :stable, :compose); end + def utf8nfc!; utf8map!(:stable, :compose); end + def utf8nfkd; utf8map( :stable, :decompose, :compat); end + def utf8nfkd!; utf8map!(:stable, :decompose, :compat); end + def utf8nfkc; utf8map( :stable, :compose, :compat); end + def utf8nfkc!; utf8map!(:stable, :compose, :compat); end + def utf8chars + result = self.utf8map(:charbound).split("\377") + result.shift if result.first == "" + result + end + def char_ary + # depecated, use String#utf8chars instead + utf8chars + end + end + + module IntegerExtensions + def utf8 + return Utf8Proc::utf8char(self) + end + end + +end + + +class String + include(Utf8Proc::StringExtensions) +end + +class Integer + include(Utf8Proc::IntegerExtensions) +end + Added: subversion/upstream/utf8proc/ruby/utf8proc_native.c URL: http://svn.apache.org/viewvc/subversion/upstream/utf8proc/ruby/utf8proc_native.c?rev=1405742&view=auto ============================================================================== --- subversion/upstream/utf8proc/ruby/utf8proc_native.c (added) +++ subversion/upstream/utf8proc/ruby/utf8proc_native.c Mon Nov 5 10:54:56 2012 @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + + +/* + * File name: ruby/utf8proc_native.c + * + * Description: + * Native part of the ruby wrapper for libutf8proc. + */ + + +#include "../utf8proc.c" +#include "ruby.h" + +#ifndef RSTRING_PTR +#define RSTRING_PTR(s) (RSTRING(s)->ptr) +#endif +#ifndef RSTRING_LEN +#define RSTRING_LEN(s) (RSTRING(s)->len) +#endif + +typedef struct utf8proc_ruby_mapenv_struct { + int32_t *buffer; +} utf8proc_ruby_mapenv_t; + +void utf8proc_ruby_mapenv_free(utf8proc_ruby_mapenv_t *env) { + free(env->buffer); + free(env); +} + +VALUE utf8proc_ruby_module; +VALUE utf8proc_ruby_options; +VALUE utf8proc_ruby_eUnicodeError; +VALUE utf8proc_ruby_eInvalidUtf8Error; +VALUE utf8proc_ruby_eCodeNotAssignedError; + +VALUE utf8proc_ruby_map_error(ssize_t result) { + VALUE excpt_class; + switch (result) { + case UTF8PROC_ERROR_NOMEM: + excpt_class = rb_eNoMemError; break; + case UTF8PROC_ERROR_OVERFLOW: + case UTF8PROC_ERROR_INVALIDOPTS: + excpt_class = rb_eArgError; break; + case UTF8PROC_ERROR_INVALIDUTF8: + excpt_class = utf8proc_ruby_eInvalidUtf8Error; break; + case UTF8PROC_ERROR_NOTASSIGNED: + excpt_class = utf8proc_ruby_eCodeNotAssignedError; break; + default: + excpt_class = rb_eRuntimeError; + } + rb_raise(excpt_class, "%s", utf8proc_errmsg(result)); + return Qnil; +} + +VALUE utf8proc_ruby_map(VALUE self, VALUE str_param, VALUE options_param) { + VALUE str; + int options; + VALUE env_obj; + utf8proc_ruby_mapenv_t *env; + ssize_t result; + VALUE retval; + str = StringValue(str_param); + options = NUM2INT(options_param) & ~UTF8PROC_NULLTERM; + env_obj = Data_Make_Struct(rb_cObject, utf8proc_ruby_mapenv_t, NULL, + utf8proc_ruby_mapenv_free, env); + result = utf8proc_decompose(RSTRING_PTR(str), RSTRING_LEN(str), + NULL, 0, options); + if (result < 0) { + utf8proc_ruby_map_error(result); + return Qnil; /* needed to prevent problems with optimization */ + } + env->buffer = ALLOC_N(int32_t, result+1); + result = utf8proc_decompose(RSTRING_PTR(str), RSTRING_LEN(str), + env->buffer, result, options); + if (result < 0) { + free(env->buffer); + env->buffer = 0; + utf8proc_ruby_map_error(result); + return Qnil; /* needed to prevent problems with optimization */ + } + result = utf8proc_reencode(env->buffer, result, options); + if (result < 0) { + free(env->buffer); + env->buffer = 0; + utf8proc_ruby_map_error(result); + return Qnil; /* needed to prevent problems with optimization */ + } + retval = rb_str_new((char *)env->buffer, result); + free(env->buffer); + env->buffer = 0; + return retval; +} + +static VALUE utf8proc_ruby_char(VALUE self, VALUE code_param) { + char buffer[4]; + ssize_t result; + int uc; + uc = NUM2INT(code_param); + if (!utf8proc_codepoint_valid(uc)) + rb_raise(rb_eArgError, "Invalid Unicode code point"); + result = utf8proc_encode_char(uc, buffer); + return rb_str_new(buffer, result); +} + +#define register_utf8proc_option(sym, field) \ + rb_hash_aset(utf8proc_ruby_options, ID2SYM(rb_intern(sym)), INT2FIX(field)) + +void Init_utf8proc_native() { + utf8proc_ruby_module = rb_define_module("Utf8Proc"); + rb_define_module_function(utf8proc_ruby_module, "utf8map", + utf8proc_ruby_map, 2); + rb_define_module_function(utf8proc_ruby_module, "utf8char", + utf8proc_ruby_char, 1); + utf8proc_ruby_eUnicodeError = rb_define_class_under(utf8proc_ruby_module, + "UnicodeError", rb_eStandardError); + utf8proc_ruby_eInvalidUtf8Error = rb_define_class_under( + utf8proc_ruby_module, "InvalidUtf8Error", utf8proc_ruby_eUnicodeError); + utf8proc_ruby_eCodeNotAssignedError = rb_define_class_under( + utf8proc_ruby_module, "CodeNotAssignedError", + utf8proc_ruby_eUnicodeError); + utf8proc_ruby_options = rb_hash_new(); + register_utf8proc_option("stable", UTF8PROC_STABLE); + register_utf8proc_option("compat", UTF8PROC_COMPAT); + register_utf8proc_option("compose", UTF8PROC_COMPOSE); + register_utf8proc_option("decompose", UTF8PROC_DECOMPOSE); + register_utf8proc_option("ignore", UTF8PROC_IGNORE); + register_utf8proc_option("rejectna", UTF8PROC_REJECTNA); + register_utf8proc_option("nlf2ls", UTF8PROC_NLF2LS); + register_utf8proc_option("nlf2ps", UTF8PROC_NLF2PS); + register_utf8proc_option("nlf2lf", UTF8PROC_NLF2LF); + register_utf8proc_option("stripcc", UTF8PROC_STRIPCC); + register_utf8proc_option("casefold", UTF8PROC_CASEFOLD); + register_utf8proc_option("charbound", UTF8PROC_CHARBOUND); + register_utf8proc_option("lump", UTF8PROC_LUMP); + register_utf8proc_option("stripmark", UTF8PROC_STRIPMARK); + OBJ_FREEZE(utf8proc_ruby_options); + rb_define_const(utf8proc_ruby_module, "Options", utf8proc_ruby_options); +} + Added: subversion/upstream/utf8proc/utf8proc.c URL: http://svn.apache.org/viewvc/subversion/upstream/utf8proc/utf8proc.c?rev=1405742&view=auto ============================================================================== --- subversion/upstream/utf8proc/utf8proc.c (added) +++ subversion/upstream/utf8proc/utf8proc.c Mon Nov 5 10:54:56 2012 @@ -0,0 +1,587 @@ +/* + * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/* + * This library contains derived data from a modified version of the + * Unicode data files. + * + * The original data files are available at + * http://www.unicode.org/Public/UNIDATA/ + * + * Please notice the copyright statement in the file "utf8proc_data.c". + */ + + +/* + * File name: utf8proc.c + * + * Description: + * Implementation of libutf8proc. + */ + + +#include "utf8proc.h" +#include "utf8proc_data.c" + + +const int8_t utf8proc_utf8class[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; + +#define UTF8PROC_HANGUL_SBASE 0xAC00 +#define UTF8PROC_HANGUL_LBASE 0x1100 +#define UTF8PROC_HANGUL_VBASE 0x1161 +#define UTF8PROC_HANGUL_TBASE 0x11A7 +#define UTF8PROC_HANGUL_LCOUNT 19 +#define UTF8PROC_HANGUL_VCOUNT 21 +#define UTF8PROC_HANGUL_TCOUNT 28 +#define UTF8PROC_HANGUL_NCOUNT 588 +#define UTF8PROC_HANGUL_SCOUNT 11172 +/* END is exclusive */ +#define UTF8PROC_HANGUL_L_START 0x1100 +#define UTF8PROC_HANGUL_L_END 0x115A +#define UTF8PROC_HANGUL_L_FILLER 0x115F +#define UTF8PROC_HANGUL_V_START 0x1160 +#define UTF8PROC_HANGUL_V_END 0x11A3 +#define UTF8PROC_HANGUL_T_START 0x11A8 +#define UTF8PROC_HANGUL_T_END 0x11FA +#define UTF8PROC_HANGUL_S_START 0xAC00 +#define UTF8PROC_HANGUL_S_END 0xD7A4 + + +#define UTF8PROC_BOUNDCLASS_START 0 +#define UTF8PROC_BOUNDCLASS_OTHER 1 +#define UTF8PROC_BOUNDCLASS_CR 2 +#define UTF8PROC_BOUNDCLASS_LF 3 +#define UTF8PROC_BOUNDCLASS_CONTROL 4 +#define UTF8PROC_BOUNDCLASS_EXTEND 5 +#define UTF8PROC_BOUNDCLASS_L 6 +#define UTF8PROC_BOUNDCLASS_V 7 +#define UTF8PROC_BOUNDCLASS_T 8 +#define UTF8PROC_BOUNDCLASS_LV 9 +#define UTF8PROC_BOUNDCLASS_LVT 10 + + +const char *utf8proc_version(void) { + return "1.1.5"; +} + +const char *utf8proc_errmsg(ssize_t errcode) { + switch (errcode) { + case UTF8PROC_ERROR_NOMEM: + return "Memory for processing UTF-8 data could not be allocated."; + case UTF8PROC_ERROR_OVERFLOW: + return "UTF-8 string is too long to be processed."; + case UTF8PROC_ERROR_INVALIDUTF8: + return "Invalid UTF-8 string"; + case UTF8PROC_ERROR_NOTASSIGNED: + return "Unassigned Unicode code point found in UTF-8 string."; + case UTF8PROC_ERROR_INVALIDOPTS: + return "Invalid options for UTF-8 processing chosen."; + default: + return "An unknown error occured while processing UTF-8 data."; + } +} + +ssize_t utf8proc_iterate( + const uint8_t *str, ssize_t strlen, int32_t *dst +) { + int length; + int i; + int32_t uc = -1; + *dst = -1; + if (!strlen) return 0; + length = utf8proc_utf8class[str[0]]; + if (!length) return UTF8PROC_ERROR_INVALIDUTF8; + if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8; + for (i=1; i<length; i++) { + if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8; + } + switch (length) { + case 1: + uc = str[0]; + break; + case 2: + uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F); + if (uc < 0x80) uc = -1; + break; + case 3: + uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6) + + (str[2] & 0x3F); + if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) || + (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1; + break; + case 4: + uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12) + + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F); + if (uc < 0x10000 || uc >= 0x110000) uc = -1; + break; + } + if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) + return UTF8PROC_ERROR_INVALIDUTF8; + *dst = uc; + return length; +} + +bool utf8proc_codepoint_valid(int32_t uc) { + if (uc < 0 || uc >= 0x110000 || + ((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) || + (uc >= 0xFDD0 && uc < 0xFDF0)) return false; + else return true; +} + +ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) { + if (uc < 0x00) { + return 0; + } else if (uc < 0x80) { + dst[0] = uc; + return 1; + } else if (uc < 0x800) { + dst[0] = 0xC0 + (uc >> 6); + dst[1] = 0x80 + (uc & 0x3F); + return 2; + } else if (uc == 0xFFFF) { + dst[0] = 0xFF; + return 1; + } else if (uc == 0xFFFE) { + dst[0] = 0xFE; + return 1; + } else if (uc < 0x10000) { + dst[0] = 0xE0 + (uc >> 12); + dst[1] = 0x80 + ((uc >> 6) & 0x3F); + dst[2] = 0x80 + (uc & 0x3F); + return 3; + } else if (uc < 0x110000) { + dst[0] = 0xF0 + (uc >> 18); + dst[1] = 0x80 + ((uc >> 12) & 0x3F); + dst[2] = 0x80 + ((uc >> 6) & 0x3F); + dst[3] = 0x80 + (uc & 0x3F); + return 4; + } else return 0; +} + +const utf8proc_property_t *utf8proc_get_property(int32_t uc) { + /* ASSERT: uc >= 0 && uc < 0x110000 */ + return utf8proc_properties + ( + utf8proc_stage2table[ + utf8proc_stage1table[uc >> 8] + (uc & 0xFF) + ] + ); +} + +#define utf8proc_decompose_lump(replacement_uc) \ + return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ + options & ~UTF8PROC_LUMP, last_boundclass) + +ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, + int options, int *last_boundclass) { + /* ASSERT: uc >= 0 && uc < 0x110000 */ + const utf8proc_property_t *property; + utf8proc_propval_t category; + int32_t hangul_sindex; + property = utf8proc_get_property(uc); + category = property->category; + hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; + if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { + if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { + int32_t hangul_tindex; + if (bufsize >= 1) { + dst[0] = UTF8PROC_HANGUL_LBASE + + hangul_sindex / UTF8PROC_HANGUL_NCOUNT; + if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + + (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; + } + hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; + if (!hangul_tindex) return 2; + if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; + return 3; + } + } + if (options & UTF8PROC_REJECTNA) { + if (!category) return UTF8PROC_ERROR_NOTASSIGNED; + } + if (options & UTF8PROC_IGNORE) { + if (property->ignorable) return 0; + } + if (options & UTF8PROC_LUMP) { + if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); + if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) + utf8proc_decompose_lump(0x0027); + if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212) + utf8proc_decompose_lump(0x002D); + if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F); + if (uc == 0x2236) utf8proc_decompose_lump(0x003A); + if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008) + utf8proc_decompose_lump(0x003C); + if (uc == 0x203A || uc == 0x232A || uc == 0x3009) + utf8proc_decompose_lump(0x003E); + if (uc == 0x2216) utf8proc_decompose_lump(0x005C); + if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303) + utf8proc_decompose_lump(0x005E); + if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD) + utf8proc_decompose_lump(0x005F); + if (uc == 0x02CB) utf8proc_decompose_lump(0x0060); + if (uc == 0x2223) utf8proc_decompose_lump(0x007C); + if (uc == 0x223C) utf8proc_decompose_lump(0x007E); + if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) { + if (category == UTF8PROC_CATEGORY_ZL || + category == UTF8PROC_CATEGORY_ZP) + utf8proc_decompose_lump(0x000A); + } + } + if (options & UTF8PROC_STRIPMARK) { + if (category == UTF8PROC_CATEGORY_MN || + category == UTF8PROC_CATEGORY_MC || + category == UTF8PROC_CATEGORY_ME) return 0; + } + if (options & UTF8PROC_CASEFOLD) { + if (property->casefold_mapping) { + const int32_t *casefold_entry; + ssize_t written = 0; + for (casefold_entry = property->casefold_mapping; + *casefold_entry >= 0; casefold_entry++) { + written += utf8proc_decompose_char(*casefold_entry, dst+written, + (bufsize > written) ? (bufsize - written) : 0, options, + last_boundclass); + if (written < 0) return UTF8PROC_ERROR_OVERFLOW; + } + return written; + } + } + if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { + if (property->decomp_mapping && + (!property->decomp_type || (options & UTF8PROC_COMPAT))) { + const int32_t *decomp_entry; + ssize_t written = 0; + for (decomp_entry = property->decomp_mapping; + *decomp_entry >= 0; decomp_entry++) { + written += utf8proc_decompose_char(*decomp_entry, dst+written, + (bufsize > written) ? (bufsize - written) : 0, options, + last_boundclass); + if (written < 0) return UTF8PROC_ERROR_OVERFLOW; + } + return written; + } + } + if (options & UTF8PROC_CHARBOUND) { + bool boundary; + int tbc, lbc; + tbc = + (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR : + (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF : + ((category == UTF8PROC_CATEGORY_ZL || + category == UTF8PROC_CATEGORY_ZP || + category == UTF8PROC_CATEGORY_CC || + category == UTF8PROC_CATEGORY_CF) && + !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL : + property->extend ? UTF8PROC_BOUNDCLASS_EXTEND : + ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) || + uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L : + (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ? + UTF8PROC_BOUNDCLASS_V : + (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ? + UTF8PROC_BOUNDCLASS_T : + (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? ( + ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ? + UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT + ) : + UTF8PROC_BOUNDCLASS_OTHER; + lbc = *last_boundclass; + boundary = + (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false : + (lbc == UTF8PROC_BOUNDCLASS_START) ? true : + (lbc == UTF8PROC_BOUNDCLASS_CR && + tbc == UTF8PROC_BOUNDCLASS_LF) ? false : + (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true : + (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true : + (lbc == UTF8PROC_BOUNDCLASS_L && + (tbc == UTF8PROC_BOUNDCLASS_L || + tbc == UTF8PROC_BOUNDCLASS_V || + tbc == UTF8PROC_BOUNDCLASS_LV || + tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : + ((lbc == UTF8PROC_BOUNDCLASS_LV || + lbc == UTF8PROC_BOUNDCLASS_V) && + (tbc == UTF8PROC_BOUNDCLASS_V || + tbc == UTF8PROC_BOUNDCLASS_T)) ? false : + ((lbc == UTF8PROC_BOUNDCLASS_LVT || + lbc == UTF8PROC_BOUNDCLASS_T) && + tbc == UTF8PROC_BOUNDCLASS_T) ? false : + true; + *last_boundclass = tbc; + if (boundary) { + if (bufsize >= 1) dst[0] = 0xFFFF; + if (bufsize >= 2) dst[1] = uc; + return 2; + } + } + if (bufsize >= 1) *dst = uc; + return 1; +} + +ssize_t utf8proc_decompose( + const uint8_t *str, ssize_t strlen, + int32_t *buffer, ssize_t bufsize, int options +) { + /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ + ssize_t wpos = 0; + if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) + return UTF8PROC_ERROR_INVALIDOPTS; + if ((options & UTF8PROC_STRIPMARK) && + !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) + return UTF8PROC_ERROR_INVALIDOPTS; + { + int32_t uc; + ssize_t rpos = 0; + ssize_t decomp_result; + int boundclass = UTF8PROC_BOUNDCLASS_START; + while (1) { + if (options & UTF8PROC_NULLTERM) { + rpos += utf8proc_iterate(str + rpos, -1, &uc); + /* checking of return value is not neccessary, + as 'uc' is < 0 in case of error */ + if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; + if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; + if (uc == 0) break; + } else { + if (rpos >= strlen) break; + rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); + if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; + } + decomp_result = utf8proc_decompose_char( + uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, + &boundclass + ); + if (decomp_result < 0) return decomp_result; + wpos += decomp_result; + /* prohibiting integer overflows due to too long strings: */ + if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2) + return UTF8PROC_ERROR_OVERFLOW; + } + } + if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) { + ssize_t pos = 0; + while (pos < wpos-1) { + int32_t uc1, uc2; + const utf8proc_property_t *property1, *property2; + uc1 = buffer[pos]; + uc2 = buffer[pos+1]; + property1 = utf8proc_get_property(uc1); + property2 = utf8proc_get_property(uc2); + if (property1->combining_class > property2->combining_class && + property2->combining_class > 0) { + buffer[pos] = uc2; + buffer[pos+1] = uc1; + if (pos > 0) pos--; else pos++; + } else { + pos++; + } + } + } + return wpos; +} + +ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) { + /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored + ASSERT: 'buffer' has one spare byte of free space at the end! */ + if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { + ssize_t rpos; + ssize_t wpos = 0; + int32_t uc; + for (rpos = 0; rpos < length; rpos++) { + uc = buffer[rpos]; + if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; + if (uc == 0x000A || uc == 0x000D || uc == 0x0085 || + ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) { + if (options & UTF8PROC_NLF2LS) { + if (options & UTF8PROC_NLF2PS) { + buffer[wpos++] = 0x000A; + } else { + buffer[wpos++] = 0x2028; + } + } else { + if (options & UTF8PROC_NLF2PS) { + buffer[wpos++] = 0x2029; + } else { + buffer[wpos++] = 0x0020; + } + } + } else if ((options & UTF8PROC_STRIPCC) && + (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) { + if (uc == 0x0009) buffer[wpos++] = 0x0020; + } else { + buffer[wpos++] = uc; + } + } + length = wpos; + } + if (options & UTF8PROC_COMPOSE) { + int32_t *starter = NULL; + int32_t current_char; + const utf8proc_property_t *starter_property = NULL, *current_property; + utf8proc_propval_t max_combining_class = -1; + ssize_t rpos; + ssize_t wpos = 0; + int32_t composition; + for (rpos = 0; rpos < length; rpos++) { + current_char = buffer[rpos]; + current_property = utf8proc_get_property(current_char); + if (starter && current_property->combining_class > max_combining_class) { + /* combination perhaps possible */ + int32_t hangul_lindex; + int32_t hangul_sindex; + hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; + if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { + int32_t hangul_vindex; + hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; + if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { + *starter = UTF8PROC_HANGUL_SBASE + + (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) * + UTF8PROC_HANGUL_TCOUNT; + starter_property = NULL; + continue; + } + } + hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; + if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && + (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { + int32_t hangul_tindex; + hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; + if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { + *starter += hangul_tindex; + starter_property = NULL; + continue; + } + } + if (!starter_property) { + starter_property = utf8proc_get_property(*starter); + } + if (starter_property->comb1st_index >= 0 && + current_property->comb2nd_index >= 0) { + composition = utf8proc_combinations[ + starter_property->comb1st_index + + current_property->comb2nd_index + ]; + if (composition >= 0 && (!(options & UTF8PROC_STABLE) || + !(utf8proc_get_property(composition)->comp_exclusion))) { + *starter = composition; + starter_property = NULL; + continue; + } + } + } + buffer[wpos] = current_char; + if (current_property->combining_class) { + if (current_property->combining_class > max_combining_class) { + max_combining_class = current_property->combining_class; + } + } else { + starter = buffer + wpos; + starter_property = NULL; + max_combining_class = -1; + } + wpos++; + } + length = wpos; + } + { + ssize_t rpos, wpos = 0; + int32_t uc; + for (rpos = 0; rpos < length; rpos++) { + uc = buffer[rpos]; + wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos); + } + ((uint8_t *)buffer)[wpos] = 0; + return wpos; + } +} + +ssize_t utf8proc_map( + const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options +) { + int32_t *buffer; + ssize_t result; + *dstptr = NULL; + result = utf8proc_decompose(str, strlen, NULL, 0, options); + if (result < 0) return result; + buffer = malloc(result * sizeof(int32_t) + 1); + if (!buffer) return UTF8PROC_ERROR_NOMEM; + result = utf8proc_decompose(str, strlen, buffer, result, options); + if (result < 0) { + free(buffer); + return result; + } + result = utf8proc_reencode(buffer, result, options); + if (result < 0) { + free(buffer); + return result; + } + { + int32_t *newptr; + newptr = realloc(buffer, (size_t)result+1); + if (newptr) buffer = newptr; + } + *dstptr = (uint8_t *)buffer; + return result; +} + +uint8_t *utf8proc_NFD(const uint8_t *str) { + uint8_t *retval; + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | + UTF8PROC_DECOMPOSE); + return retval; +} + +uint8_t *utf8proc_NFC(const uint8_t *str) { + uint8_t *retval; + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | + UTF8PROC_COMPOSE); + return retval; +} + +uint8_t *utf8proc_NFKD(const uint8_t *str) { + uint8_t *retval; + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | + UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); + return retval; +} + +uint8_t *utf8proc_NFKC(const uint8_t *str) { + uint8_t *retval; + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | + UTF8PROC_COMPOSE | UTF8PROC_COMPAT); + return retval; +} + Added: subversion/upstream/utf8proc/utf8proc.h URL: http://svn.apache.org/viewvc/subversion/upstream/utf8proc/utf8proc.h?rev=1405742&view=auto ============================================================================== --- subversion/upstream/utf8proc/utf8proc.h (added) +++ subversion/upstream/utf8proc/utf8proc.h Mon Nov 5 10:54:56 2012 @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + + +/* + * File name: utf8proc.h + * + * Description: + * Header files for libutf8proc, which is a mapping tool for UTF-8 strings + * with following features: + * - decomposing and composing of strings + * - replacing compatibility characters with their equivalents + * - stripping of "default ignorable characters" + * like SOFT-HYPHEN or ZERO-WIDTH-SPACE + * - folding of certain characters for string comparison + * (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-") + * (see "LUMP" option) + * - optional rejection of strings containing non-assigned code points + * - stripping of control characters + * - stripping of character marks (accents, etc.) + * - transformation of LF, CRLF, CR and NEL to line-feed (LF) + * or to the unicode chararacters for paragraph separation (PS) + * or line separation (LS). + * - unicode case folding (for case insensitive string comparisons) + * - rejection of illegal UTF-8 data + * (i.e. UTF-8 encoded UTF-16 surrogates) + * - support for korean hangul characters + * Unicode Version 5.0.0 is supported. + */ + + +#ifndef UTF8PROC_H +#define UTF8PROC_H + + +#include <stdlib.h> +#include <sys/types.h> +#ifdef _MSC_VER +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef short int16_t; +typedef unsigned short uint16_t; +typedef int int32_t; +#ifdef _WIN64 +#define ssize_t __int64 +#else +#define ssize_t int +#endif +typedef unsigned char bool; +enum {false, true}; +#else +#include <stdbool.h> +#include <inttypes.h> +#endif +#include <limits.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef SSIZE_MAX +#define SSIZE_MAX ((size_t)SIZE_MAX/2) +#endif + +#define UTF8PROC_NULLTERM (1<<0) +#define UTF8PROC_STABLE (1<<1) +#define UTF8PROC_COMPAT (1<<2) +#define UTF8PROC_COMPOSE (1<<3) +#define UTF8PROC_DECOMPOSE (1<<4) +#define UTF8PROC_IGNORE (1<<5) +#define UTF8PROC_REJECTNA (1<<6) +#define UTF8PROC_NLF2LS (1<<7) +#define UTF8PROC_NLF2PS (1<<8) +#define UTF8PROC_NLF2LF (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS) +#define UTF8PROC_STRIPCC (1<<9) +#define UTF8PROC_CASEFOLD (1<<10) +#define UTF8PROC_CHARBOUND (1<<11) +#define UTF8PROC_LUMP (1<<12) +#define UTF8PROC_STRIPMARK (1<<13) +/* + * Flags being regarded by several functions in the library: + * NULLTERM: The given UTF-8 input is NULL terminated. + * STABLE: Unicode Versioning Stability has to be respected. + * COMPAT: Compatiblity decomposition + * (i.e. formatting information is lost) + * COMPOSE: Return a result with composed characters. + * DECOMPOSE: Return a result with decomposed characters. + * IGNORE: Strip "default ignorable characters" + * REJECTNA: Return an error, if the input contains unassigned + * code points. + * NLF2LS: Indicating that NLF-sequences (LF, CRLF, CR, NEL) are + * representing a line break, and should be converted to the + * unicode character for line separation (LS). + * NLF2PS: Indicating that NLF-sequences are representing a paragraph + * break, and should be converted to the unicode character for + * paragraph separation (PS). + * NLF2LF: Indicating that the meaning of NLF-sequences is unknown. + * STRIPCC: Strips and/or convers control characters. + * NLF-sequences are transformed into space, except if one of + * the NLF2LS/PS/LF options is given. + * HorizontalTab (HT) and FormFeed (FF) are treated as a + * NLF-sequence in this case. + * All other control characters are simply removed. + * CASEFOLD: Performs unicode case folding, to be able to do a + * case-insensitive string comparison. + * CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which + * is representing a single grapheme cluster (see UAX#29). + * LUMP: Lumps certain characters together + * (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-"). + * (See lump.txt for details.) + * If NLF2LF is set, this includes a transformation of + * paragraph and line separators to ASCII line-feed (LF). + * STRIPMARK: Strips all character markings + * (non-spacing, spacing and enclosing) (i.e. accents) + * NOTE: this option works only with COMPOSE or DECOMPOSE + */ + +#define UTF8PROC_ERROR_NOMEM -1 +#define UTF8PROC_ERROR_OVERFLOW -2 +#define UTF8PROC_ERROR_INVALIDUTF8 -3 +#define UTF8PROC_ERROR_NOTASSIGNED -4 +#define UTF8PROC_ERROR_INVALIDOPTS -5 +/* + * Error codes being returned by almost all functions: + * ERROR_NOMEM: Memory could not be allocated. + * ERROR_OVERFLOW: The given string is too long to be processed. + * ERROR_INVALIDUTF8: The given string is not a legal UTF-8 string. + * ERROR_NOTASSIGNED: The REJECTNA flag was set, + * and an unassigned code point was found. + * ERROR_INVALIDOPTS: Invalid options have been used. + */ + +typedef int16_t utf8proc_propval_t; +typedef struct utf8proc_property_struct { + utf8proc_propval_t category; + utf8proc_propval_t combining_class; + utf8proc_propval_t bidi_class; + utf8proc_propval_t decomp_type; + const int32_t *decomp_mapping; + unsigned bidi_mirrored:1; + int32_t uppercase_mapping; + int32_t lowercase_mapping; + int32_t titlecase_mapping; + int32_t comb1st_index; + int32_t comb2nd_index; + unsigned comp_exclusion:1; + unsigned ignorable:1; + unsigned control_boundary:1; + unsigned extend:1; + const int32_t *casefold_mapping; +} utf8proc_property_t; + +#define UTF8PROC_CATEGORY_LU 1 +#define UTF8PROC_CATEGORY_LL 2 +#define UTF8PROC_CATEGORY_LT 3 +#define UTF8PROC_CATEGORY_LM 4 +#define UTF8PROC_CATEGORY_LO 5 +#define UTF8PROC_CATEGORY_MN 6 +#define UTF8PROC_CATEGORY_MC 7 +#define UTF8PROC_CATEGORY_ME 8 +#define UTF8PROC_CATEGORY_ND 9 +#define UTF8PROC_CATEGORY_NL 10 +#define UTF8PROC_CATEGORY_NO 11 +#define UTF8PROC_CATEGORY_PC 12 +#define UTF8PROC_CATEGORY_PD 13 +#define UTF8PROC_CATEGORY_PS 14 +#define UTF8PROC_CATEGORY_PE 15 +#define UTF8PROC_CATEGORY_PI 16 +#define UTF8PROC_CATEGORY_PF 17 +#define UTF8PROC_CATEGORY_PO 18 +#define UTF8PROC_CATEGORY_SM 19 +#define UTF8PROC_CATEGORY_SC 20 +#define UTF8PROC_CATEGORY_SK 21 +#define UTF8PROC_CATEGORY_SO 22 +#define UTF8PROC_CATEGORY_ZS 23 +#define UTF8PROC_CATEGORY_ZL 24 +#define UTF8PROC_CATEGORY_ZP 25 +#define UTF8PROC_CATEGORY_CC 26 +#define UTF8PROC_CATEGORY_CF 27 +#define UTF8PROC_CATEGORY_CS 28 +#define UTF8PROC_CATEGORY_CO 29 +#define UTF8PROC_CATEGORY_CN 30 +#define UTF8PROC_BIDI_CLASS_L 1 +#define UTF8PROC_BIDI_CLASS_LRE 2 +#define UTF8PROC_BIDI_CLASS_LRO 3 +#define UTF8PROC_BIDI_CLASS_R 4 +#define UTF8PROC_BIDI_CLASS_AL 5 +#define UTF8PROC_BIDI_CLASS_RLE 6 +#define UTF8PROC_BIDI_CLASS_RLO 7 +#define UTF8PROC_BIDI_CLASS_PDF 8 +#define UTF8PROC_BIDI_CLASS_EN 9 +#define UTF8PROC_BIDI_CLASS_ES 10 +#define UTF8PROC_BIDI_CLASS_ET 11 +#define UTF8PROC_BIDI_CLASS_AN 12 +#define UTF8PROC_BIDI_CLASS_CS 13 +#define UTF8PROC_BIDI_CLASS_NSM 14 +#define UTF8PROC_BIDI_CLASS_BN 15 +#define UTF8PROC_BIDI_CLASS_B 16 +#define UTF8PROC_BIDI_CLASS_S 17 +#define UTF8PROC_BIDI_CLASS_WS 18 +#define UTF8PROC_BIDI_CLASS_ON 19 +#define UTF8PROC_DECOMP_TYPE_FONT 1 +#define UTF8PROC_DECOMP_TYPE_NOBREAK 2 +#define UTF8PROC_DECOMP_TYPE_INITIAL 3 +#define UTF8PROC_DECOMP_TYPE_MEDIAL 4 +#define UTF8PROC_DECOMP_TYPE_FINAL 5 +#define UTF8PROC_DECOMP_TYPE_ISOLATED 6 +#define UTF8PROC_DECOMP_TYPE_CIRCLE 7 +#define UTF8PROC_DECOMP_TYPE_SUPER 8 +#define UTF8PROC_DECOMP_TYPE_SUB 9 +#define UTF8PROC_DECOMP_TYPE_VERTICAL 10 +#define UTF8PROC_DECOMP_TYPE_WIDE 11 +#define UTF8PROC_DECOMP_TYPE_NARROW 12 +#define UTF8PROC_DECOMP_TYPE_SMALL 13 +#define UTF8PROC_DECOMP_TYPE_SQUARE 14 +#define UTF8PROC_DECOMP_TYPE_FRACTION 15 +#define UTF8PROC_DECOMP_TYPE_COMPAT 16 + +extern const int8_t utf8proc_utf8class[256]; + +const char *utf8proc_version(void); + +const char *utf8proc_errmsg(ssize_t errcode); +/* + * Returns a static error string for the given error code. + */ + +ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst); +/* + * Reads a single char from the UTF-8 sequence being pointed to by 'str'. + * The maximum number of bytes read is 'strlen', unless 'strlen' is + * negative. + * If a valid unicode char could be read, it is stored in the variable + * being pointed to by 'dst', otherwise that variable will be set to -1. + * In case of success the number of bytes read is returned, otherwise a + * negative error code is returned. + */ + +bool utf8proc_codepoint_valid(int32_t uc); +/* + * Returns 1, if the given unicode code-point is valid, otherwise 0. + */ + +ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst); +/* + * Encodes the unicode char with the code point 'uc' as an UTF-8 string in + * the byte array being pointed to by 'dst'. This array has to be at least + * 4 bytes long. + * In case of success the number of bytes written is returned, + * otherwise 0. + * This function does not check if 'uc' is a valid unicode code point. + */ + +const utf8proc_property_t *utf8proc_get_property(int32_t uc); +/* + * Returns a pointer to a (constant) struct containing information about + * the unicode char with the given code point 'uc'. + * If the character is not existent a pointer to a special struct is + * returned, where 'category' is a NULL pointer. + * WARNING: The parameter 'uc' has to be in the range of 0x0000 to + * 0x10FFFF, otherwise the program might crash! + */ + +ssize_t utf8proc_decompose_char( + int32_t uc, int32_t *dst, ssize_t bufsize, + int options, int *last_boundclass +); +/* + * Writes a decomposition of the unicode char 'uc' into the array being + * pointed to by 'dst'. + * Following flags in the 'options' field are regarded: + * REJECTNA: an unassigned unicode code point leads to an error + * IGNORE: "default ignorable" chars are stripped + * CASEFOLD: unicode casefolding is applied + * COMPAT: replace certain characters with their + * compatibility decomposition + * CHARBOUND: Inserts 0xFF bytes before each grapheme cluster + * LUMP: lumps certain different characters together + * STRIPMARK: removes all character marks + * The pointer 'last_boundclass' has to point to an integer variable which + * is storing the last character boundary class, if the CHARBOUND option + * is used. + * In case of success the number of chars written is returned, + * in case of an error, a negative error code is returned. + * If the number of written chars would be bigger than 'bufsize', + * the buffer (up to 'bufsize') has inpredictable data, and the needed + * buffer size is returned. + * WARNING: The parameter 'uc' has to be in the range of 0x0000 to + * 0x10FFFF, otherwise the program might crash! + */ + +ssize_t utf8proc_decompose( + const uint8_t *str, ssize_t strlen, + int32_t *buffer, ssize_t bufsize, int options +); +/* + * Does the same as 'utf8proc_decompose_char', but acts on a whole UTF-8 + * string, and orders the decomposed sequences correctly. + * If the NULLTERM flag in 'options' is set, processing will be stopped, + * when a NULL byte is encounted, otherwise 'strlen' bytes are processed. + * The result in form of unicode code points is written into the buffer + * being pointed to by 'buffer', having the length of 'bufsize' entries. + * In case of success the number of chars written is returned, + * in case of an error, a negative error code is returned. + * If the number of written chars would be bigger than 'bufsize', + * the buffer (up to 'bufsize') has inpredictable data, and the needed + * buffer size is returned. + */ + +ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options); +/* + * Reencodes the sequence of unicode characters given by the pointer + * 'buffer' and 'length' as UTF-8. + * The result is stored in the same memory area where the data is read. + * Following flags in the 'options' field are regarded: + * NLF2LS: converts LF, CRLF, CR and NEL into LS + * NLF2PS: converts LF, CRLF, CR and NEL into PS + * NLF2LF: converts LF, CRLF, CR and NEL into LF + * STRIPCC: strips or converts all non-affected control characters + * COMPOSE: tries to combine decomposed characters into composite + * characters + * STABLE: prohibits combining characters which would violate + * the unicode versioning stability + * In case of success the length of the resulting UTF-8 string is + * returned, otherwise a negative error code is returned. + * WARNING: The amount of free space being pointed to by 'buffer', has to + * exceed the amount of the input data by one byte, and the + * entries of the array pointed to by 'str' have to be in the + * range of 0x0000 to 0x10FFFF, otherwise the program might + * crash! + */ + +ssize_t utf8proc_map( + const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options +); +/* + * Maps the given UTF-8 string being pointed to by 'str' to a new UTF-8 + * string, which is allocated dynamically, and afterwards pointed to by + * the pointer being pointed to by 'dstptr'. + * If the NULLTERM flag in the 'options' field is set, the length is + * determined by a NULL terminator, otherwise the parameter 'strlen' is + * evaluated to determine the string length, but in any case the result + * will be NULL terminated (though it might contain NULL characters + * before). Other flags in the 'options' field are passed to the functions + * defined above, and regarded as described. + * In case of success the length of the new string is returned, + * otherwise a negative error code is returned. + * NOTICE: The memory of the new UTF-8 string will have been allocated with + * 'malloc', and has theirfore to be freed with 'free'. + */ + +uint8_t *utf8proc_NFD(const uint8_t *str); +uint8_t *utf8proc_NFC(const uint8_t *str); +uint8_t *utf8proc_NFKD(const uint8_t *str); +uint8_t *utf8proc_NFKC(const uint8_t *str); +/* + * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC + * normalized version of the null-terminated string 'str'. + */ + +#ifdef __cplusplus +} +#endif + +#endif +
