Module Name: src Committed By: perseant Date: Mon Jul 31 04:29:50 UTC 2017
Modified Files: src/lib/libc/citrus [perseant-stdc-iso10646]: citrus_lc_collate.c src/lib/libc/locale [perseant-stdc-iso10646]: collate_local.h collate_locale.c unicode_collate.c unicode_nfd_qc_data.h unicode_ucd.c unicode_ucd.h Log Message: Support loading collation data from file. Began with FreeBSD's xlocale_collate, but had to change it somewhat to accommodate the requirements of the Unicode Collation Algorithm (in particular, there are maps from single-character collation elements to multiple collation weight vectors, and multiple-to-multiple mappings as well). To generate a diff of this commit: cvs rdiff -u -r1.1.2.1 -r1.1.2.2 src/lib/libc/citrus/citrus_lc_collate.c cvs rdiff -u -r1.1.2.1 -r1.1.2.2 src/lib/libc/locale/collate_local.h \ src/lib/libc/locale/collate_locale.c \ src/lib/libc/locale/unicode_collate.c \ src/lib/libc/locale/unicode_nfd_qc_data.h \ src/lib/libc/locale/unicode_ucd.c src/lib/libc/locale/unicode_ucd.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/lib/libc/citrus/citrus_lc_collate.c diff -u src/lib/libc/citrus/citrus_lc_collate.c:1.1.2.1 src/lib/libc/citrus/citrus_lc_collate.c:1.1.2.2 --- src/lib/libc/citrus/citrus_lc_collate.c:1.1.2.1 Fri Jul 14 15:53:07 2017 +++ src/lib/libc/citrus/citrus_lc_collate.c Mon Jul 31 04:29:50 2017 @@ -1,4 +1,4 @@ -/* $NetBSD: citrus_lc_collate.c,v 1.1.2.1 2017/07/14 15:53:07 perseant Exp $ */ +/* $NetBSD: citrus_lc_collate.c,v 1.1.2.2 2017/07/31 04:29:50 perseant Exp $ */ /*- * Copyright (c)2008 Citrus Project, @@ -28,7 +28,7 @@ #include <sys/cdefs.h> #if defined(LIBC_SCCS) && !defined(lint) -__RCSID("$NetBSD: citrus_lc_collate.c,v 1.1.2.1 2017/07/14 15:53:07 perseant Exp $"); +__RCSID("$NetBSD: citrus_lc_collate.c,v 1.1.2.2 2017/07/31 04:29:50 perseant Exp $"); #endif /* LIBC_SCCS and not lint */ #include "reentrant.h" @@ -65,13 +65,13 @@ __RCSID("$NetBSD: citrus_lc_collate.c,v /* * macro required by nb_lc_template(_decl).h */ -#define _CATEGORY_TYPE _CollateLocale +#define _CATEGORY_TYPE struct xlocale_collate #include "nb_lc_template_decl.h" static int _citrus_LC_COLLATE_create_impl(const char * __restrict root, - const char * __restrict name, _CollateLocale ** __restrict pdata) + const char * __restrict name, struct xlocale_collate ** __restrict pdata) { char path[PATH_MAX + 1]; int ret; @@ -92,7 +92,7 @@ _citrus_LC_COLLATE_create_impl(const cha } static __inline void -_PREFIX(update_global)(_CollateLocale *data) +_PREFIX(update_global)(struct xlocale_collate *data) { _DIAGASSERT(data != NULL); } Index: src/lib/libc/locale/collate_local.h diff -u src/lib/libc/locale/collate_local.h:1.1.2.1 src/lib/libc/locale/collate_local.h:1.1.2.2 --- src/lib/libc/locale/collate_local.h:1.1.2.1 Fri Jul 14 15:53:08 2017 +++ src/lib/libc/locale/collate_local.h Mon Jul 31 04:29:50 2017 @@ -1,26 +1,45 @@ -/* $NetBSD: collate_local.h,v 1.1.2.1 2017/07/14 15:53:08 perseant Exp $ */ +/* $NetBSD: collate_local.h,v 1.1.2.2 2017/07/31 04:29:50 perseant Exp $ */ #ifndef _COLLATE_LOCAL_H_ #define _COLLATE_LOCAL_H_ #include <locale.h> +#include "collate.h" #include "unicode_ucd.h" typedef struct _CollateLocale { void *coll_variable; size_t coll_variable_len; - struct ucd_coll *coll_data; + struct ucd_coll *coll_data; /* XXX obsolescent */ size_t coll_data_len; + const struct _FileCollateLocale *coll_fcl; +#define coll_collinfo coll_fcl->fcl_collinfo +#define coll_char_data coll_fcl->fcl_char_data + const collate_subst_t *coll_subst; + const collate_chain_t *coll_chains; + const collate_large_t *coll_large; } _CollateLocale; +typedef struct _FileCollateLocale { + collate_info_t fcl_collinfo; + collate_char_t fcl_char_data[0x100]; +/* + These fields are variable length (perhaps 0) + and follow the previous fields in the file: + collate_chain_t *chains; + collate_large_t *large; + collate_subst_t *subst; +*/ +} _FileCollateLocale; + /* * global variables */ -extern __dso_hidden const _CollateLocale _DefaultCollateLocale; +extern __dso_hidden const struct xlocale_collate _DefaultCollateLocale; __BEGIN_DECLS -int _collate_load(const char * __restrict, size_t, _CollateLocale ** __restrict); +int _collate_load(const char * __restrict, size_t, struct xlocale_collate ** __restrict); __END_DECLS #endif /* !_COLLATE_LOCAL_H_ */ Index: src/lib/libc/locale/collate_locale.c diff -u src/lib/libc/locale/collate_locale.c:1.1.2.1 src/lib/libc/locale/collate_locale.c:1.1.2.2 --- src/lib/libc/locale/collate_locale.c:1.1.2.1 Fri Jul 14 15:53:08 2017 +++ src/lib/libc/locale/collate_locale.c Mon Jul 31 04:29:50 2017 @@ -1,4 +1,4 @@ -/* $NetBSD: collate_locale.c,v 1.1.2.1 2017/07/14 15:53:08 perseant Exp $ */ +/* $NetBSD: collate_locale.c,v 1.1.2.2 2017/07/31 04:29:50 perseant Exp $ */ /*- * Copyright (c)2010 Citrus Project, * All rights reserved. @@ -25,53 +25,152 @@ * SUCH DAMAGE. */ +#ifdef COLLATION_TEST +#include "collation_test.h" +#endif + #define __SETLOCALE_SOURCE__ #include <assert.h> #include <errno.h> #include <locale.h> -#include <stdio.h> #include <stdlib.h> +#include <string.h> #include <wchar.h> +#include "collate.h" + #include "setlocale_local.h" #include "collate_local.h" -#include "unicode_collation_data.h" +#include "ducet_collation_data.h" +#ifndef COLLATION_TEST #include "citrus_module.h" +#endif + +const struct collate_info _DefaultCollateInfo = { + 4, /* uint8_t directive_count; */ + {}, /* uint8_t directive[COLL_WEIGHTS_MAX]; */ + {}, /* int32_t pri_count[COLL_WEIGHTS_MAX]; */ + 0, /* int32_t flags; */ + DUCET_COLLATE_CHAINS_LENGTH, /* int32_t chain_count; */ + DUCET_COLLATE_LARGE_LENGTH, /* int32_t large_count; */ + { 0 }, /* int32_t subst_count[COLL_WEIGHTS_MAX]; */ + { 0 }, /* int32_t undef_pri[COLL_WEIGHTS_MAX]; */ + DUCET_COLLATE_RCHAINS_LENGTH, /* int32_t rchain_count; */ + DUCET_COLLATE_DCHAINS_LENGTH, /* int32_t rchain_count; */ +}; -const _CollateLocale _DefaultCollateLocale = { - __UNCONST("DUCET"), - 5, - &ucd_collate_data[0], - UCD_COLLATE_DATA_LENGTH +const struct xlocale_collate _DefaultCollateLocale = { + 0, /* int __collate_load_error; */ + NULL, /* char * map; */ + 0, /* size_t maplen; */ + + &_DefaultCollateInfo, /* collate_info_t *info; */ + ducet_collate_chars, /* collate_char_t *char_pri_table; */ + ducet_collate_large, /* collate_large_t *large_pri_table; */ + ducet_collate_chains, /* collate_chain_t *chain_pri_table; */ + ducet_collate_rchains, /* collate_rchain_t *rchain_pri_table; */ + ducet_collate_dchains, /* collate_dchain_t *dchain_pri_table; */ + NULL, /* collate_subst_t *subst_table[COLL_WEIGHTS_MAX]; */ }; +static int +_collate_read_file(const char * __restrict, size_t, + struct xlocale_collate ** __restrict); + int _collate_load(const char * __restrict var, size_t lenvar, - _CollateLocale ** __restrict prl) + struct xlocale_collate ** __restrict prl) { - int ret; - _DIAGASSERT(var != NULL || lenvar < 1); _DIAGASSERT(prl != NULL); - if (lenvar < 1) + if (lenvar < COLLATE_STR_LEN) return EFTYPE; - switch (*var) { - case 'U': -#ifdef notyet - ret = _collate_read_file(var, lenvar, prl); -#else - *prl = (_CollateLocale *)malloc(sizeof(**prl)); - (*prl)->coll_variable = __UNCONST("FAKE"); - (*prl)->coll_variable_len = 4; - (*prl)->coll_data = (struct ucd_coll *)malloc(sizeof(struct ucd_coll)); - (*prl)->coll_data_len = 0; - ret = 0; -#endif - break; - default: - ret = EFTYPE; + + if (strncmp(var, COLLATE_VERSION, COLLATE_STR_LEN)) + return EFTYPE; + + var += COLLATE_STR_LEN; + lenvar -= COLLATE_STR_LEN; + + return _collate_read_file(var, lenvar, prl); } - return ret; + +static int +_collate_read_file(const char * __restrict var, size_t lenvar, + struct xlocale_collate ** __restrict prl) +{ + struct xlocale_collate *clp; + size_t section_length; + char *ci; + + if (lenvar < sizeof(struct collate_info)) + return EFTYPE; + + clp = (struct xlocale_collate *)malloc(sizeof(*clp)); + ci = (char *)malloc(lenvar); + memcpy(ci, var, lenvar); + + /* File header */ + clp->info = (const struct collate_info *)ci; + ci += sizeof(*clp->info); + lenvar -= sizeof(*clp->info); + + /* Table of narrow character priorities */ + clp->char_pri_table = (const collate_char_t *)ci; + section_length = 0x80 * sizeof(collate_chain_t); + if (lenvar < section_length) + goto errout; + ci += section_length; + lenvar -= section_length; + + /* Collation elements ("chains") */ + clp->chain_pri_table = (const collate_chain_t *)ci; + section_length = clp->info->chain_count * sizeof(collate_chain_t); + if (lenvar < section_length) + goto errout; + ci += section_length; + lenvar -= section_length; + + /* Collation weights for characters > 0x80 */ + clp->large_pri_table = (const collate_large_t *)ci; + section_length = clp->info->large_count * sizeof(collate_large_t); + if (lenvar < section_length) + goto errout; + ci += section_length; + lenvar -= section_length; + +#if 0 + /* Substitutions */ + clp->subst_table = (collate_subst_t *)ci; + section_length = clp->info->subst_count * sizeof(collate_subst_t); + if (lenvar < section_length) + goto errout; + ci += section_length; + lenvar -= section_length; +#endif + + /* Characters that have more than one associated weight (reverse chains) */ + clp->rchain_pri_table = (const collate_rchain_t *)ci; + section_length = clp->info->rchain_count * sizeof(collate_rchain_t); + if (lenvar < section_length) + goto errout; + ci += section_length; + lenvar -= section_length; + + /* Double chains (>1 char to >1 weight mapping) */ + clp->dchain_pri_table = (const collate_dchain_t *)ci; + section_length = clp->info->dchain_count * sizeof(collate_dchain_t); + if (lenvar < section_length) + goto errout; + ci += section_length; + lenvar -= section_length; + + *prl = clp; + return 0; + +errout: + free(clp); + return EFTYPE; } Index: src/lib/libc/locale/unicode_collate.c diff -u src/lib/libc/locale/unicode_collate.c:1.1.2.1 src/lib/libc/locale/unicode_collate.c:1.1.2.2 --- src/lib/libc/locale/unicode_collate.c:1.1.2.1 Fri Jul 14 15:53:08 2017 +++ src/lib/libc/locale/unicode_collate.c Mon Jul 31 04:29:50 2017 @@ -27,7 +27,12 @@ * SUCH DAMAGE. */ +#ifdef COLLATION_TEST +#include "collation_test.h" +#endif /* COLLATION_TEST */ + #include <sys/queue.h> +#include <stdio.h> #include <stdlib.h> #include <locale.h> #include <string.h> @@ -50,7 +55,7 @@ int wcscoll_l(const wchar_t *s1, const wchar_t *s2, locale_t loc) { - if (loc->part_impl[LC_COLLATE] == _lc_C_locale.part_impl[LC_COLLATE]) + if (loc && loc->part_impl[LC_COLLATE] == _lc_C_locale.part_impl[LC_COLLATE]) return wcscmp(s1, s2); return unicode_wcscoll_l(s1, s2, loc); } @@ -120,19 +125,21 @@ unicode_wcstokey(uint16_t **key, size_t *keylen = clen0; #ifdef DEBUG_FINAL_WEIGHTS - fprintf(stderr, " final weights:"); - for (ret = 0; (size_t)ret < *keylen; ++ret) { - if ((*key)[ret] == 0x0) - fprintf(stderr, " |"); + printf(" final weights:"); + for (clen0 = 0; (size_t)clen0 < *keylen; ++clen0) { + if ((*key)[clen0] == 0x0) + printf(" |"); else - fprintf(stderr, " %4.4hx", (*key)[ret]); + printf(" %4.4hx", (*key)[clen0]); } - fprintf(stderr, "\n"); + printf("\n"); #endif /* Free the tailq */ TAILQ_FOREACH_SAFE(entry, &c_arg0, tailq, tentry) { TAILQ_REMOVE(&c_arg0, entry, tailq); + if (entry->flags & CE_FLAGS_NEEDSFREE) + free(__UNCONST(entry->pri)); free(entry); } TAILQ_INIT(&c_arg0); @@ -265,7 +272,6 @@ unicode_wcscoll_l(const wchar_t *arg0, c * of the original strings. */ if (ret == 0) { - fprintf(stderr, " tie breaker: "); fprint_unicode_string(stderr, (m_arg0 ? m_arg0 : arg0)); fprintf(stderr, " vs "); fprint_unicode_string(stderr, (m_arg1 ? m_arg1 : arg1)); @@ -286,7 +292,7 @@ unicode_wcscoll_l(const wchar_t *arg0, c static wchar_t * unicode_recursive_decompose(const wchar_t *arg0, wchar_t **m_arg0) { - wchar_t *wcp, *dst, *decomposition, *targ0; + wchar_t *wcp, *dst, *decomposition, *targ0, *dend; int recurse = 0; int swap_again = 0; int max_factor = 1, n = 0; @@ -330,16 +336,21 @@ unicode_recursive_decompose(const wchar_ wcscpy(dst, targ0); } else { for (wcp = targ0, dst = *m_arg0; *wcp; ++wcp) { - if (unicode_get_nfd_qc(*wcp) > 0) + if (*wcp < 0x80) + decomposition = NULL; + if (unicode_get_nfd_qc(*wcp) > 0) { decomposition = NULL; - else + } else decomposition = unicode_decomposition(*wcp, &n); if (decomposition == NULL) { *dst++ = *wcp; } else { - while (*decomposition) { - int n2 = 0; - if (unicode_decomposition(*decomposition, &n2) != NULL) { + dend = decomposition + n; + while (decomposition < dend) { + /* int n2 = 0; */ + if (*decomposition >= 0x80 + && unicode_get_nfd_qc(*decomposition) == 0 + /* && unicode_decomposition(*decomposition, &n2) != NULL */) { recurse = 1; } *dst++ = *decomposition++; @@ -396,30 +407,34 @@ static uint16_t * unicode_coll2numkey(struct collation_set *cc, int *lenp, locale_t loc) { struct collation_element *ccp; - weight_t val; + int val; uint16_t *ret, *rp; int max_level = COLLATE_MAX_LEVEL(loc) - 1; /* 2 or 3 */ - int shift, count; + int i, j, count; /* Count the weights */ count = 0; TAILQ_FOREACH(ccp, cc, tailq) { - count++; + count += ccp->len; } - rp = ret = (uint16_t *)calloc((count + 1) * (max_level + 1), sizeof(*rp)); - for (shift = 16 * max_level; shift >= 0; shift -= 16) { - if (COLLATE_FLAGS(loc, level) & COLL_BACKWARDS) { + rp = ret = (uint16_t *)calloc(2 * (count + 1) * (max_level + 1), sizeof(*rp)); + for (i = 0; i < COLL_WEIGHTS_MAX; i++) { + if (loc && _COLLATE_LOCALE(loc)->info->directive[i] & COLL_BACKWARDS) { TAILQ_FOREACH_REVERSE(ccp, cc, collation_set, tailq) { - val = (ccp->weight >> shift) & 0xffff; - if (val != 0) { + for (j = ccp->len - 1; j >= 0; --j) { + val = ccp->pri[j * COLL_WEIGHTS_MAX + i]; + if (val != 0) *rp++ = val; } } } else { TAILQ_FOREACH(ccp, cc, tailq) { - val = (ccp->weight >> shift) & 0xffff; - if (val != 0) { + if (ccp->pri == NULL) + continue; + for (j = 0; j < ccp->len; j++) { + val = ccp->pri[j * COLL_WEIGHTS_MAX + i]; + if (val != 0) *rp++ = val; } } @@ -427,6 +442,7 @@ unicode_coll2numkey(struct collation_set /* Between each level we place a zero weight marker. */ *rp++ = 0; } + *lenp = rp - ret; return ret; } Index: src/lib/libc/locale/unicode_nfd_qc_data.h diff -u src/lib/libc/locale/unicode_nfd_qc_data.h:1.1.2.1 src/lib/libc/locale/unicode_nfd_qc_data.h:1.1.2.2 --- src/lib/libc/locale/unicode_nfd_qc_data.h:1.1.2.1 Fri Jul 14 15:53:08 2017 +++ src/lib/libc/locale/unicode_nfd_qc_data.h Mon Jul 31 04:29:50 2017 @@ -1,3 +1,5 @@ +#define UCD_NFD_QC_DATA_LENGTH 13232 + static struct ucd_nfd_qc ucd_nfd_qc_data[] = { { 0x000C0, 0x0 }, { 0x000C1, 0x0 }, Index: src/lib/libc/locale/unicode_ucd.c diff -u src/lib/libc/locale/unicode_ucd.c:1.1.2.1 src/lib/libc/locale/unicode_ucd.c:1.1.2.2 --- src/lib/libc/locale/unicode_ucd.c:1.1.2.1 Fri Jul 14 15:53:08 2017 +++ src/lib/libc/locale/unicode_ucd.c Mon Jul 31 04:29:50 2017 @@ -27,13 +27,19 @@ * SUCH DAMAGE. */ +#ifdef COLLATION_TEST +#include "collation_test.h" +#include <stdio.h> +#endif + #include <assert.h> #include <stdlib.h> #include <sys/types.h> #include <locale.h> #include <wchar.h> #include "unicode_ucd.h" -#include "unicode_decomp_data.h" +#include "unicode_dm_data.h" +/* #include "unicode_decomp_data.h" */ /* #include "unicode_collation_data.h" - DUCET */ #include "unicode_ccc_data.h" #include "unicode_nfd_qc_data.h" @@ -43,31 +49,22 @@ #include "setlocale_local.h" #include "collate_local.h" -static const struct ucd_coll * ucd_get_collation_element(wchar_t *, const _CollateLocale *, int *, int); +static int ucd_get_collation_element(wchar_t *, const struct xlocale_collate *, struct collation_element *, int); -#define USE_NFD_QC_HASH_TABLE +/* #define USE_NFD_QC_HASH_TABLE */ #define USE_CCC_HASH_TABLE -#define USE_DECOMP_HASH_TABLE /* #define USE_COLL_HASH_TABLE */ +#ifdef USE_NFD_QC_HASH_TABLE #define NFD_QC_WIDTH 1024 #define NFD_QC_HASH(x) (((x) >> 2) & 0x3ff) static struct ucd_nfd_qc_head *nfd_qc_hash; +#endif #define CCC_WIDTH 1024 #define CCC_HASH(x) (((x) >> 2) & 0x3ff) static struct ucd_ccc_head *ccc_hash; -#define DECOMP_WIDTH 1024 -#define DECOMP_HASH(x) (((x) >> 2) & 0x3ff) -static struct ucd_decomp_head *decomp_hash; - -#ifdef USE_COLL_HASH_TABLE -#define COLL_WIDTH 1024 -#define COLL_HASH(x) (((x) >> 2) & 0x3ff) -static struct ucd_coll_head *coll_hash; -#endif - #define RESV_CP_WIDTH 1024 #define RESV_CP_HASH(x) (((x) >> 2) & 0x3ff) static struct ucd_resv_cp_head *resv_cp_hash; @@ -77,8 +74,8 @@ static struct ucd_resv_range_head resv_r uint8_t unicode_get_nfd_qc(wchar_t wc) { int dfl = 1; - struct ucd_nfd_qc *ccp; #ifdef USE_NFD_QC_HASH_TABLE + struct ucd_nfd_qc *ccp; int i; if (nfd_qc_hash == NULL) { @@ -99,13 +96,22 @@ uint8_t unicode_get_nfd_qc(wchar_t wc) } } #else - for (ccp = ucd_nfd_qc_data; ccp->cp; ++ccp) { - if (ccp->cp > wc) { - return dfl; - } - if (ccp->cp == wc) { - return ccp->nfd_qc; - } + int l, r, m; + wchar_t u; + + /* Try single substitutions first */ + l = 0; + r = UCD_NFD_QC_DATA_LENGTH - 1; + while (l <= r) { + m = (l + r) / 2; + u = ucd_nfd_qc_data[m].cp; + if (u == wc) { + return ucd_nfd_qc_data[m].nfd_qc; + } + if (u < wc) + l = m + 1; + else + r = m - 1; } #endif /* USE_NFD_QC_HASH_TABLE */ return dfl; @@ -127,9 +133,11 @@ uint8_t unicode_get_ccc(wchar_t wc) } SLIST_FOREACH(ccp, &ccc_hash[CCC_HASH(wc)], hash_next) { +#if 0 if (ccp->cp < wc) { return 0; } +#endif if (ccp->cp == wc) { return ccp->ccc; } @@ -153,133 +161,252 @@ uint8_t unicode_get_ccc(wchar_t wc) */ wchar_t *unicode_decomposition(wchar_t wc, int *np) { - struct ucd_decomp *ccp; -#ifdef USE_DECOMP_HASH_TABLE - int i; + int l, r, m; + wchar_t u; - if (decomp_hash == NULL) { - decomp_hash = (struct ucd_decomp_head *)calloc(DECOMP_WIDTH, sizeof(*decomp_hash)); - for (i = 0; i < DECOMP_WIDTH; i++) - SLIST_INIT(decomp_hash + i); - for (ccp = ucd_decomp_data; ccp->cp; ++ccp) { - SLIST_INSERT_HEAD(&decomp_hash[DECOMP_HASH(ccp->cp)], ccp, hash_next); - } - } - - SLIST_FOREACH(ccp, &decomp_hash[DECOMP_HASH(wc)], hash_next) { - if (ccp->cp < wc) { - return NULL; - } - if (ccp->cp == wc) { - *np = ccp->n; - return (wchar_t *)ccp->dm; - } + /* Try single substitutions first */ + l = 0; + r = UCD_DECOMP_SINGLE_DATA_LENGTH - 1; + while (l <= r) { + m = (l + r) / 2; + u = ucd_decomp_single_data[m].cp; + if (u == wc) { + *np = 1; + return &ucd_decomp_single_data[m].dm; + } + if (u < wc) + l = m + 1; + else + r = m - 1; } -#else /* ! USE_DECOMP_HASH_TABLE */ - for (ccp = ucd_decomp_data; ccp->cp; ++ccp) { - if (ccp->cp > wc) - return NULL; - if (ccp->cp == wc) { - *np = ccp->n; - return (wchar_t *)ccp->dm; + + /* Then pairs */ + l = 0; + r = UCD_DECOMP_PAIR_DATA_LENGTH - 1; + while (l <= r) { + m = (l + r) / 2; + u = ucd_decomp_pair_data[m].cp; + if (u == wc) { + *np = 2; + return ucd_decomp_pair_data[m].dm; + } + if (u < wc) + l = m + 1; + else + r = m - 1; + } + + /* Then the rest */ + l = 0; + r = UCD_DECOMP_MISC_DATA_LENGTH - 1; + while (l <= r) { + m = (l + r) / 2; + u = ucd_decomp_misc_data[m].cp; + if (u == wc) { + *np = ucd_decomp_misc_data[m].n; + return ucd_decomp_misc_data[m].dm; + } + if (u < wc) + l = m + 1; + else + r = m - 1; } - } -#endif /* ! USE_DECOMP_HASH_TABLE */ return NULL; -} + } -static const struct ucd_coll * -ucd_get_collation_element(wchar_t *wcp, const _CollateLocale *cloc, int *np, int do_ccc) +int +ucd_get_collation_element(wchar_t *wcp, const struct xlocale_collate *xc, struct collation_element *out, int do_ccc) { - const struct ucd_coll *cc, *best_cc = NULL, *sc; -#ifdef USE_COLL_HASH_TABLE - struct ucd_coll *ccp; -#endif - wchar_t *wcccp, tmp; - int best_len = 0; - int i, match; + struct collation_element sc; + wchar_t *wcccp, tmp, u; + size_t i, j, sclen, best = 0; uint8_t ccc, this_ccc; + int m, l, r; -#ifdef USE_COLL_HASH_TABLE - if (coll_hash == NULL) { - coll_hash = (struct ucd_coll_head *)calloc(COLL_WIDTH, sizeof(*coll_hash)); - for (i = 0; i < COLL_WIDTH; i++) - SLIST_INIT(coll_hash + i); - for (ccp = ucd_collate_data; ccp->cp < 0x0FFFFFFF; ++ccp) { - wchar_t wc = ccp->cp; - if (wc == 0x0) - wc = ccp->cpp[0]; - SLIST_INSERT_HEAD(&coll_hash[COLL_HASH(wc)], ccp, hash_next); + sc.flags = 0x0; + out->pri = NULL; + out->len = 1; + + if (xc != NULL) { + /* Find the index of the leading character, if it appears */ + l = m = 0; + r = xc->info->chain_count - 1; + while (l <= r) { + m = (l + r) / 2; + u = xc->chain_pri_table[m].str[0]; + if (u == wcp[0]) { + break; + } + if (u < wcp[0]) + l = m + 1; + else + r = m - 1; + } + + /* Find the first element that begins with wcp[0] */ + while (wcp[0] == xc->chain_pri_table[m].str[0]) + --m; + ++m; + + if (wcp[0] == xc->chain_pri_table[m].str[0]) { + const wchar_t * pm = xc->chain_pri_table[m].str; + /* There is at least one matching chain. */ + /* Walk through all the possibilities. */ + while (wcp[0] == pm[0]) { + pm = xc->chain_pri_table[m].str; + /* Check that this element matches */ + for (i = 0; pm[i]; i++) { + if (wcp[i] != pm[i]) + break; + } + if (pm[i] == 0) { + /* It matches */ + if (i > best) { + out->pri = xc->chain_pri_table[m].pri; + best = i; } } + ++m; + } +} - SLIST_FOREACH(cc, &coll_hash[COLL_HASH(*wcp)], hash_next) { -#else - for (cc = cloc->coll_data; cc->cp < 0x0FFFFFFF; ++cc) { - if (cc->cp > wcp[0]) { + /* Check double chains. */ + if (best == 0) { + l = 0; + r = xc->info->dchain_count - 1; + while (l <= r) { + m = (l + r) / 2; + u = xc->dchain_pri_table[m].str[0]; + if (u == wcp[0]) { + break; + } + if (u < wcp[0]) + l = m + 1; + else + r = m - 1; + } + /* Find the first element that begins with wcp[0] */ + while (wcp[0] == xc->dchain_pri_table[m].str[0]) + --m; + ++m; + + if (wcp[0] == xc->dchain_pri_table[m].str[0]) { + const wchar_t * pm = xc->dchain_pri_table[m].str; + /* There is at least one matching chain. */ + /* Walk through all the possibilities. */ + while (wcp[0] == pm[0]) { + pm = xc->dchain_pri_table[m].str; + /* Check that this element matches */ + for (i = 0; pm[i]; i++) { + if (wcp[i] != pm[i]) break; } -#endif + if (pm[i] == 0) { + /* It matches */ + if (i > best) { + out->pri = (weight_t *)xc->dchain_pri_table[m].pri; + for (j = 0; j < COLLATE_STR_LEN && xc->dchain_pri_table[m].pri[j][0] != (weight_t)0xFFFFFFFF; ++j) + ; + out->len = j; + best = i; + } + } + ++m; + } + } + } - /* - * Multi-character matches - */ - if (cc->cp == 0x0) { - match = 1; - for (i = 0; cc->cpp[i] != 0x0; ++i) { - if (cc->cpp[i] != wcp[i]) { - match = 0; + if (best == 0) { + /* There is no matching chain. Look up individual weights. */ + if (wcp[0] < 0x80) { + out->pri = xc->char_pri_table[wcp[0]].pri; + best = 1; + } else { + /* Now look at large matches */ + l = 0; + r = xc->info->large_count; + while (l <= r) { + m = (l + r) / 2; + u = xc->large_pri_table[m].val; + if (u == wcp[0]) { + out->pri = xc->large_pri_table[m].pri; + best = 1; break; } + if (u < wcp[0]) + l = m + 1; + else + r = m - 1; } - if (match && i > best_len) { - best_cc = cc; - best_len = i; } - } else if (cc->cp == wcp[0] && best_len == 0) { - /* Single-character match */ - best_cc = cc; - best_len = 1; + } + + /* Check reverse chains. */ + if (best == 0) { + l = 0; + r = xc->info->rchain_count; + while (l <= r) { + m = (l + r) / 2; + u = xc->rchain_pri_table[m].val; + if (u == wcp[0]) { + out->pri = (weight_t *)xc->rchain_pri_table[m].pri; + for (i = 0; i < COLLATE_STR_LEN && xc->rchain_pri_table[m].pri[i][0] != (weight_t)0xFFFFFFFF; ++i) + ; + out->len = i; + best = 1; + break; + } + if (u < wcp[0]) + l = m + 1; + else + r = m - 1; } } - if (best_cc != NULL && do_ccc) { + if (best > 0 && do_ccc) { /* - * We have found the longest string S that matches in the table. - * Now let the set of unblocked non-starters immediately following S - * be called CC. For any C in CC, if S+C has a match in the table, - * replace S with S+C and remove C from CC. Repeat as long as we - * are expanding S. Then copy the collation weights. + * We have found the longest string S that matches in + * the table. Now let the set of unblocked + * non-starters immediately following S be called CC. + * For any C in CC, if S+C has a match in the table, + * replace S with S+C and remove C from CC. Repeat as + * long as we are expanding S. Then copy the + * collation weights. */ ccc = 0xff; - for (wcccp = wcp + best_len; *wcccp; ++wcccp) { + for (wcccp = wcp + best; *wcccp; ++wcccp) { this_ccc = unicode_get_ccc(*wcccp); - if (this_ccc == 0 || this_ccc > ccc) + if (this_ccc == 0) break; - /* Swap this and wcp + best_len, and look for a match */ + if (this_ccc == ccc) + continue; + ccc = this_ccc; + + /* Swap this and wcp + best, and look for a match */ tmp = *wcccp; - *wcccp = *(wcp + best_len); - *(wcp + best_len) = tmp; - sc = ucd_get_collation_element(wcp, cloc, &best_len, 0); - if (sc != best_cc) { - best_cc = sc; + *wcccp = *(wcp + best); + *(wcp + best) = tmp; + sclen = ucd_get_collation_element(wcp, xc, &sc, 0); + if (sclen > best) { + *out = sc; /* Structure copy */ + best = sclen; /* Back up and start over */ - wcccp = wcp + best_len; + wcccp = wcp + best; } else { - /* swap back - best_len should be unchanged */ + /* swap back */ tmp = *wcccp; - *wcccp = *(wcp + best_len); - *(wcp + best_len) = tmp; + *wcccp = *(wcp + best); + *(wcp + best) = tmp; + } } } } /* Fall back to DUCET if we don't find anything in the locale */ - if (best_len == 0 && cloc != &_DefaultCollateLocale) - return ucd_get_collation_element(wcp, &_DefaultCollateLocale, np, do_ccc); + if (best == 0 && xc != &_DefaultCollateLocale) + return ucd_get_collation_element(wcp, &_DefaultCollateLocale, out, do_ccc); - *np = best_len; - return best_cc; + return best; } #define UNASSIGNED_CACHE_WIDTH 16 @@ -402,9 +529,6 @@ ucd_check_unassigned(wchar_t wc) return unassigned; } -#define _COLLATE_LOCALE(loc) \ - ((_CollateLocale *)((loc)->part_impl[(size_t)LC_COLLATE])) - /* * Add the collation element sequence that corresponds with * the beginning of the given string to head. @@ -412,20 +536,27 @@ ucd_check_unassigned(wchar_t wc) */ int unicode_get_collation_element(struct collation_set *head, wchar_t *wcp, locale_t loc) { - const struct ucd_coll *cc; - const weight_t *ce; - struct collation_element *new_element; - struct ucd_coll lcc; - int n, unassigned; - _CollateLocale *cloc = _COLLATE_LOCALE(loc); - - cc = ucd_get_collation_element(wcp, cloc, &n, 1); - - if (cc == NULL) { + struct collation_element *cc; + int n, unassigned, len; + struct xlocale_collate *cloc; + weight_t *wp; + weight_t aaaa = 0x0, bbbb = 0x0; + + cc = (struct collation_element *)malloc(sizeof(*cc)); + cc->flags = 0x0; + + if (loc == NULL) + cloc = NULL; + else + cloc = _COLLATE_LOCALE(loc); + len = ucd_get_collation_element(wcp, cloc, cc, 1); + + if (len) + n = len; + else { unassigned = ucd_check_unassigned(*wcp); /* If we couldn't find it, supply implicit weights. */ - weight_t aaaa = 0x0, bbbb = 0x0; if (!unassigned) { if ((*wcp >= 0x17000 && *wcp <= 0x187FF) /* Tangut block */ || (*wcp >= 0x18800 && *wcp <= 0x18AFF)) /* Tangut_Components block */ @@ -467,18 +598,18 @@ int unicode_get_collation_element(struct bbbb = (*wcp & 0x7FFF) | 0x8000; } - lcc.ce[0] = (aaaa << 32) | 0x00200002; - lcc.ce[1] = (bbbb << 32) | 0x0; - lcc.ce[2] = 0x0; - cc = &lcc; + wp = (weight_t *)calloc(2 * COLL_WEIGHTS_MAX, sizeof(weight_t)); + wp[0] = aaaa; + wp[1] = 0x0020; + wp[2] = 0x0002; + wp[1 * COLL_WEIGHTS_MAX + 0] = bbbb; + cc->pri = (const weight_t *)wp; + cc->len = 2; + cc->flags |= CE_FLAGS_NEEDSFREE; n = 1; } - for (ce = cc->ce; *ce; ++ce) { - new_element = (struct collation_element *)malloc(sizeof(*new_element)); - new_element->weight = *ce; - TAILQ_INSERT_TAIL(head, new_element, tailq); - } + TAILQ_INSERT_TAIL(head, cc, tailq); return n; } Index: src/lib/libc/locale/unicode_ucd.h diff -u src/lib/libc/locale/unicode_ucd.h:1.1.2.1 src/lib/libc/locale/unicode_ucd.h:1.1.2.2 --- src/lib/libc/locale/unicode_ucd.h:1.1.2.1 Fri Jul 14 15:53:08 2017 +++ src/lib/libc/locale/unicode_ucd.h Mon Jul 31 04:29:50 2017 @@ -30,28 +30,54 @@ #ifndef _UNICODE_UCD_H #define _UNICODE_UCD_H +#ifdef COLLATION_TEST +#include "collation_test.h" +#endif + #include <sys/types.h> #include <sys/queue.h> #include <locale.h> +#include "collate_local.h" +#include "collate.h" + typedef int32_t ucd_codepoint_t; -typedef uint64_t weight_t; TAILQ_HEAD(collation_set, collation_element); SLIST_HEAD(ucd_nfd_qc_head, ucd_nfd_qc); SLIST_HEAD(ucd_ccc_head, ucd_ccc); -SLIST_HEAD(ucd_decomp_head, ucd_decomp); SLIST_HEAD(ucd_coll_head, ucd_coll); SLIST_HEAD(ucd_resv_cp_head, ucd_resv_cp); SLIST_HEAD(ucd_resv_range_head, ucd_resv_range); +#define _COLLATE_LOCALE(loc) \ + ((struct xlocale_collate *)((loc)->part_impl[(size_t)LC_COLLATE])) + /* Data definition */ +#if 0 struct ucd_decomp { ucd_codepoint_t cp; int n; ucd_codepoint_t dm[20]; /* At present, no more than 18 */ SLIST_ENTRY(ucd_decomp) hash_next; }; +#endif + +struct ucd_decomp_single { + ucd_codepoint_t cp; + ucd_codepoint_t dm; +}; + +struct ucd_decomp_pair { + ucd_codepoint_t cp; + ucd_codepoint_t dm[2]; +}; + +struct ucd_decomp_misc { + ucd_codepoint_t cp; + int n; + ucd_codepoint_t dm[20]; /* At present, no more than 18 */ +}; struct ucd_ccc { ucd_codepoint_t cp; @@ -65,13 +91,6 @@ struct ucd_nfd_qc { SLIST_ENTRY(ucd_nfd_qc) hash_next; }; -struct ucd_coll { - ucd_codepoint_t cp; - ucd_codepoint_t cpp[5]; /* XXX */ - weight_t ce[20]; /* XXX */ - SLIST_ENTRY(ucd_coll) hash_next; -}; - struct ucd_resv_cp { ucd_codepoint_t cp; SLIST_ENTRY(ucd_resv_cp) hash_next; @@ -84,7 +103,10 @@ struct ucd_resv_range { }; struct collation_element { - weight_t weight; + const weight_t *pri; + int len; +#define CE_FLAGS_NEEDSFREE 0x00000001 + u_int32_t flags; TAILQ_ENTRY(collation_element) tailq; };