Module Name:    src
Committed By:   perseant
Date:           Mon Jul 31 04:29:50 UTC 2017

Modified Files:
        src/lib/libc/citrus [perseant-stdc-iso10646]: citrus_lc_collate.c
        src/lib/libc/locale [perseant-stdc-iso10646]: collate_local.h
            collate_locale.c unicode_collate.c unicode_nfd_qc_data.h
            unicode_ucd.c unicode_ucd.h

Log Message:
Support loading collation data from file.  Began with FreeBSD's
xlocale_collate, but had to change it somewhat to accommodate the
requirements of the Unicode Collation Algorithm (in particular,
there are maps from single-character collation elements to
multiple collation weight vectors, and multiple-to-multiple
mappings as well).


To generate a diff of this commit:
cvs rdiff -u -r1.1.2.1 -r1.1.2.2 src/lib/libc/citrus/citrus_lc_collate.c
cvs rdiff -u -r1.1.2.1 -r1.1.2.2 src/lib/libc/locale/collate_local.h \
    src/lib/libc/locale/collate_locale.c \
    src/lib/libc/locale/unicode_collate.c \
    src/lib/libc/locale/unicode_nfd_qc_data.h \
    src/lib/libc/locale/unicode_ucd.c src/lib/libc/locale/unicode_ucd.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/lib/libc/citrus/citrus_lc_collate.c
diff -u src/lib/libc/citrus/citrus_lc_collate.c:1.1.2.1 src/lib/libc/citrus/citrus_lc_collate.c:1.1.2.2
--- src/lib/libc/citrus/citrus_lc_collate.c:1.1.2.1	Fri Jul 14 15:53:07 2017
+++ src/lib/libc/citrus/citrus_lc_collate.c	Mon Jul 31 04:29:50 2017
@@ -1,4 +1,4 @@
-/* $NetBSD: citrus_lc_collate.c,v 1.1.2.1 2017/07/14 15:53:07 perseant Exp $ */
+/* $NetBSD: citrus_lc_collate.c,v 1.1.2.2 2017/07/31 04:29:50 perseant Exp $ */
 
 /*-
  * Copyright (c)2008 Citrus Project,
@@ -28,7 +28,7 @@
 
 #include <sys/cdefs.h>
 #if defined(LIBC_SCCS) && !defined(lint)
-__RCSID("$NetBSD: citrus_lc_collate.c,v 1.1.2.1 2017/07/14 15:53:07 perseant Exp $");
+__RCSID("$NetBSD: citrus_lc_collate.c,v 1.1.2.2 2017/07/31 04:29:50 perseant Exp $");
 #endif /* LIBC_SCCS and not lint */
 
 #include "reentrant.h"
@@ -65,13 +65,13 @@ __RCSID("$NetBSD: citrus_lc_collate.c,v 
 /*
  * macro required by nb_lc_template(_decl).h
  */
-#define _CATEGORY_TYPE          _CollateLocale
+#define _CATEGORY_TYPE          struct xlocale_collate
 
 #include "nb_lc_template_decl.h"
 
 static int
 _citrus_LC_COLLATE_create_impl(const char * __restrict root,
-    const char * __restrict name, _CollateLocale ** __restrict pdata)
+    const char * __restrict name, struct xlocale_collate ** __restrict pdata)
 {
 	char path[PATH_MAX + 1];
 	int ret;
@@ -92,7 +92,7 @@ _citrus_LC_COLLATE_create_impl(const cha
 }
 
 static __inline void
-_PREFIX(update_global)(_CollateLocale *data)
+_PREFIX(update_global)(struct xlocale_collate *data)
 {
 	_DIAGASSERT(data != NULL);
 }

Index: src/lib/libc/locale/collate_local.h
diff -u src/lib/libc/locale/collate_local.h:1.1.2.1 src/lib/libc/locale/collate_local.h:1.1.2.2
--- src/lib/libc/locale/collate_local.h:1.1.2.1	Fri Jul 14 15:53:08 2017
+++ src/lib/libc/locale/collate_local.h	Mon Jul 31 04:29:50 2017
@@ -1,26 +1,45 @@
-/*	$NetBSD: collate_local.h,v 1.1.2.1 2017/07/14 15:53:08 perseant Exp $	*/
+/*	$NetBSD: collate_local.h,v 1.1.2.2 2017/07/31 04:29:50 perseant Exp $	*/
 
 #ifndef	_COLLATE_LOCAL_H_
 #define	_COLLATE_LOCAL_H_
 
 #include <locale.h>
+#include "collate.h"
 
 #include "unicode_ucd.h"
 
 typedef struct _CollateLocale {
 	void		*coll_variable;
 	size_t		 coll_variable_len;
-	struct ucd_coll *coll_data;
+	struct ucd_coll        *coll_data; /* XXX obsolescent */
 	size_t           coll_data_len;
+	const struct _FileCollateLocale *coll_fcl;
+#define coll_collinfo  coll_fcl->fcl_collinfo
+#define coll_char_data coll_fcl->fcl_char_data
+	const collate_subst_t  *coll_subst;
+	const collate_chain_t  *coll_chains;
+	const collate_large_t  *coll_large;
 } _CollateLocale;
 
+typedef struct _FileCollateLocale {
+	collate_info_t  fcl_collinfo;
+	collate_char_t  fcl_char_data[0x100];
+/*
+        These fields are variable length (perhaps 0)
+	and follow the previous fields in the file:
+	collate_chain_t *chains;
+	collate_large_t *large;
+	collate_subst_t *subst;
+*/
+} _FileCollateLocale;
+
 /*
  * global variables
  */
-extern __dso_hidden const _CollateLocale _DefaultCollateLocale;
+extern __dso_hidden const struct xlocale_collate _DefaultCollateLocale;
 
 __BEGIN_DECLS
-int _collate_load(const char * __restrict, size_t, _CollateLocale ** __restrict);
+int _collate_load(const char * __restrict, size_t, struct xlocale_collate ** __restrict);
 __END_DECLS
 
 #endif	/* !_COLLATE_LOCAL_H_ */
Index: src/lib/libc/locale/collate_locale.c
diff -u src/lib/libc/locale/collate_locale.c:1.1.2.1 src/lib/libc/locale/collate_locale.c:1.1.2.2
--- src/lib/libc/locale/collate_locale.c:1.1.2.1	Fri Jul 14 15:53:08 2017
+++ src/lib/libc/locale/collate_locale.c	Mon Jul 31 04:29:50 2017
@@ -1,4 +1,4 @@
-/*	$NetBSD: collate_locale.c,v 1.1.2.1 2017/07/14 15:53:08 perseant Exp $	*/
+/*	$NetBSD: collate_locale.c,v 1.1.2.2 2017/07/31 04:29:50 perseant Exp $	*/
 /*-
  * Copyright (c)2010 Citrus Project,
  * All rights reserved.
@@ -25,53 +25,152 @@
  * SUCH DAMAGE.
  */
 
+#ifdef COLLATION_TEST
+#include "collation_test.h"
+#endif
+
 #define __SETLOCALE_SOURCE__
 #include <assert.h>
 #include <errno.h>
 #include <locale.h>
-#include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include <wchar.h>
 
+#include "collate.h"
+
 #include "setlocale_local.h"
 #include "collate_local.h"
-#include "unicode_collation_data.h"
+#include "ducet_collation_data.h"
 
+#ifndef COLLATION_TEST
 #include "citrus_module.h"
+#endif
+
+const struct collate_info _DefaultCollateInfo = {
+	4,     /* uint8_t directive_count;               */
+	{},    /* uint8_t directive[COLL_WEIGHTS_MAX];   */
+	{},    /* int32_t pri_count[COLL_WEIGHTS_MAX];   */
+	0,     /* int32_t flags;                         */
+	DUCET_COLLATE_CHAINS_LENGTH, /* int32_t chain_count;       */
+	DUCET_COLLATE_LARGE_LENGTH, /* int32_t large_count;       */
+	{ 0 }, /* int32_t subst_count[COLL_WEIGHTS_MAX]; */
+	{ 0 }, /* int32_t undef_pri[COLL_WEIGHTS_MAX];   */
+	DUCET_COLLATE_RCHAINS_LENGTH, /* int32_t rchain_count;       */
+	DUCET_COLLATE_DCHAINS_LENGTH, /* int32_t rchain_count;       */
+};
 
-const _CollateLocale _DefaultCollateLocale = {
-	__UNCONST("DUCET"),
-	5,
-	&ucd_collate_data[0],
-	UCD_COLLATE_DATA_LENGTH
+const struct xlocale_collate _DefaultCollateLocale = {
+	0,    /* int __collate_load_error; */
+	NULL, /* char * map; */
+	0,    /* size_t maplen; */
+
+	&_DefaultCollateInfo, /* collate_info_t	*info; */
+	ducet_collate_chars, /* collate_char_t	*char_pri_table; */
+	ducet_collate_large, /* collate_large_t	*large_pri_table; */
+	ducet_collate_chains, /* collate_chain_t	*chain_pri_table; */
+	ducet_collate_rchains, /* collate_rchain_t	*rchain_pri_table; */
+	ducet_collate_dchains, /* collate_dchain_t	*dchain_pri_table; */
+	NULL, /* collate_subst_t	*subst_table[COLL_WEIGHTS_MAX]; */
 };
 
+static int
+_collate_read_file(const char * __restrict, size_t,
+		   struct xlocale_collate ** __restrict);
+
 int
 _collate_load(const char * __restrict var, size_t lenvar,
-    _CollateLocale ** __restrict prl)
+	      struct xlocale_collate ** __restrict prl)
 {
-	int ret;
-
 	_DIAGASSERT(var != NULL || lenvar < 1);
 	_DIAGASSERT(prl != NULL);
 
-	if (lenvar < 1)
+	if (lenvar < COLLATE_STR_LEN)
 		return EFTYPE;
-	switch (*var) {
-	case 'U':
-#ifdef notyet
-		ret = _collate_read_file(var, lenvar, prl);
-#else
-		*prl = (_CollateLocale *)malloc(sizeof(**prl));
-		(*prl)->coll_variable = __UNCONST("FAKE");
-		(*prl)->coll_variable_len = 4;
-		(*prl)->coll_data = (struct ucd_coll *)malloc(sizeof(struct ucd_coll));
-		(*prl)->coll_data_len = 0;
-		ret = 0;
-#endif
-		break;
-	default:
-		ret = EFTYPE;
+
+	if (strncmp(var, COLLATE_VERSION, COLLATE_STR_LEN))
+		return EFTYPE;
+
+	var += COLLATE_STR_LEN;
+	lenvar -= COLLATE_STR_LEN;
+
+	return _collate_read_file(var, lenvar, prl);
 	}
-	return ret;
+
+static int
+_collate_read_file(const char * __restrict var, size_t lenvar,
+		   struct xlocale_collate ** __restrict prl)
+{
+	struct xlocale_collate *clp;
+	size_t section_length;
+	char *ci;
+
+	if (lenvar < sizeof(struct collate_info))
+		return EFTYPE;
+
+	clp = (struct xlocale_collate *)malloc(sizeof(*clp));
+	ci = (char *)malloc(lenvar);
+	memcpy(ci, var, lenvar);
+
+	/* File header */
+	clp->info = (const struct collate_info *)ci;
+	ci += sizeof(*clp->info);
+	lenvar -= sizeof(*clp->info);
+
+	/* Table of narrow character priorities */
+	clp->char_pri_table = (const collate_char_t *)ci;
+	section_length = 0x80 * sizeof(collate_chain_t);
+	if (lenvar < section_length)
+		goto errout;
+	ci += section_length;
+	lenvar -= section_length;
+
+	/* Collation elements ("chains") */
+	clp->chain_pri_table = (const collate_chain_t *)ci;
+	section_length = clp->info->chain_count * sizeof(collate_chain_t);
+	if (lenvar < section_length)
+		goto errout;
+	ci += section_length;
+	lenvar -= section_length;
+
+	/* Collation weights for characters > 0x80 */
+	clp->large_pri_table = (const collate_large_t *)ci;
+	section_length = clp->info->large_count * sizeof(collate_large_t);
+	if (lenvar < section_length)
+		goto errout;
+	ci += section_length;
+	lenvar -= section_length;
+
+#if 0
+	/* Substitutions */
+	clp->subst_table = (collate_subst_t *)ci;
+	section_length = clp->info->subst_count * sizeof(collate_subst_t);
+	if (lenvar < section_length)
+		goto errout;
+	ci += section_length;
+	lenvar -= section_length;
+#endif
+
+	/* Characters that have more than one associated weight (reverse chains) */
+	clp->rchain_pri_table = (const collate_rchain_t *)ci;
+	section_length = clp->info->rchain_count * sizeof(collate_rchain_t);
+	if (lenvar < section_length)
+		goto errout;
+	ci += section_length;
+	lenvar -= section_length;
+
+	/* Double chains (>1 char to >1 weight mapping) */
+	clp->dchain_pri_table = (const collate_dchain_t *)ci;
+	section_length = clp->info->dchain_count * sizeof(collate_dchain_t);
+	if (lenvar < section_length)
+		goto errout;
+	ci += section_length;
+	lenvar -= section_length;
+
+	*prl = clp;
+	return 0;
+
+errout:
+	free(clp);
+	return EFTYPE;
 }
Index: src/lib/libc/locale/unicode_collate.c
diff -u src/lib/libc/locale/unicode_collate.c:1.1.2.1 src/lib/libc/locale/unicode_collate.c:1.1.2.2
--- src/lib/libc/locale/unicode_collate.c:1.1.2.1	Fri Jul 14 15:53:08 2017
+++ src/lib/libc/locale/unicode_collate.c	Mon Jul 31 04:29:50 2017
@@ -27,7 +27,12 @@
  * SUCH DAMAGE.
  */
 
+#ifdef COLLATION_TEST
+#include "collation_test.h"
+#endif /* COLLATION_TEST */
+
 #include <sys/queue.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <locale.h>
 #include <string.h>
@@ -50,7 +55,7 @@
 int
 wcscoll_l(const wchar_t *s1, const wchar_t *s2, locale_t loc)
 {
-	if (loc->part_impl[LC_COLLATE] == _lc_C_locale.part_impl[LC_COLLATE])
+	if (loc && loc->part_impl[LC_COLLATE] == _lc_C_locale.part_impl[LC_COLLATE])
 		return wcscmp(s1, s2);
 	return unicode_wcscoll_l(s1, s2, loc);
 }
@@ -120,19 +125,21 @@ unicode_wcstokey(uint16_t **key, size_t 
 	*keylen = clen0;
 
 #ifdef DEBUG_FINAL_WEIGHTS
-	fprintf(stderr, " final weights:");
-	for (ret = 0; (size_t)ret < *keylen; ++ret) {
-		if ((*key)[ret] == 0x0)
-			fprintf(stderr, " |");
+	printf(" final weights:");
+	for (clen0 = 0; (size_t)clen0 < *keylen; ++clen0) {
+		if ((*key)[clen0] == 0x0)
+			printf(" |");
 		else
-			fprintf(stderr, " %4.4hx", (*key)[ret]);
+			printf(" %4.4hx", (*key)[clen0]);
 	}
-	fprintf(stderr, "\n");
+	printf("\n");
 #endif
 
 	/* Free the tailq */
 	TAILQ_FOREACH_SAFE(entry, &c_arg0, tailq, tentry) {
 		TAILQ_REMOVE(&c_arg0, entry, tailq);
+		if (entry->flags & CE_FLAGS_NEEDSFREE)
+			free(__UNCONST(entry->pri));
 		free(entry);
 	}
 	TAILQ_INIT(&c_arg0);
@@ -265,7 +272,6 @@ unicode_wcscoll_l(const wchar_t *arg0, c
 	 *    of the original strings.
 	 */
 	if (ret == 0) {
-		fprintf(stderr, " tie breaker: ");
 		fprint_unicode_string(stderr, (m_arg0 ? m_arg0 : arg0));
 		fprintf(stderr, " vs ");
 		fprint_unicode_string(stderr, (m_arg1 ? m_arg1 : arg1));
@@ -286,7 +292,7 @@ unicode_wcscoll_l(const wchar_t *arg0, c
 static wchar_t *
 unicode_recursive_decompose(const wchar_t *arg0, wchar_t **m_arg0)
 {
-	wchar_t *wcp, *dst, *decomposition, *targ0;
+	wchar_t *wcp, *dst, *decomposition, *targ0, *dend;
 	int recurse = 0;
 	int swap_again = 0;
 	int max_factor = 1, n = 0;
@@ -330,16 +336,21 @@ unicode_recursive_decompose(const wchar_
 		wcscpy(dst, targ0);
 	} else {
 		for (wcp = targ0, dst = *m_arg0; *wcp; ++wcp) {
-			if (unicode_get_nfd_qc(*wcp) > 0)
+			if (*wcp < 0x80)
+				decomposition = NULL;
+			if (unicode_get_nfd_qc(*wcp) > 0) {
 				decomposition = NULL;
-			else
+			} else
 				decomposition = unicode_decomposition(*wcp, &n);
 			if (decomposition == NULL) {
 				*dst++ = *wcp;
 			} else {
-				while (*decomposition) {
-					int n2 = 0;
-					if (unicode_decomposition(*decomposition, &n2) != NULL) {
+				dend = decomposition + n;
+				while (decomposition < dend) {
+					/* int n2 = 0; */
+					if (*decomposition >= 0x80
+					    && unicode_get_nfd_qc(*decomposition) == 0
+					    /* && unicode_decomposition(*decomposition, &n2) != NULL */) {
 						recurse = 1;
 					}
 					*dst++ = *decomposition++;
@@ -396,30 +407,34 @@ static uint16_t *
 unicode_coll2numkey(struct collation_set *cc, int *lenp, locale_t loc)
 {
 	struct collation_element *ccp;
-	weight_t val;
+	int val;
 	uint16_t *ret, *rp;
 	int max_level = COLLATE_MAX_LEVEL(loc) - 1; /* 2 or 3 */
-	int shift, count;
+	int i, j, count;
 
 	/* Count the weights */
 	count = 0;
 	TAILQ_FOREACH(ccp, cc, tailq) {
-		count++;
+		count += ccp->len;
 	}
 
-	rp = ret = (uint16_t *)calloc((count + 1) * (max_level + 1), sizeof(*rp));
-	for (shift = 16 * max_level; shift >= 0; shift -= 16) {
-		if (COLLATE_FLAGS(loc, level) & COLL_BACKWARDS) {
+	rp = ret = (uint16_t *)calloc(2 * (count + 1) * (max_level + 1), sizeof(*rp));
+	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
+		if (loc && _COLLATE_LOCALE(loc)->info->directive[i] & COLL_BACKWARDS) {
 			TAILQ_FOREACH_REVERSE(ccp, cc, collation_set, tailq) {
-				val = (ccp->weight >> shift) & 0xffff;
-				if (val != 0) {
+			  	for (j = ccp->len - 1; j >= 0; --j) {
+					val = ccp->pri[j * COLL_WEIGHTS_MAX + i];
+					if (val != 0)
 					*rp++ = val;
 				}
 			}
 		} else {
 			TAILQ_FOREACH(ccp, cc, tailq) {
-				val = (ccp->weight >> shift) & 0xffff;
-				if (val != 0) {
+				if (ccp->pri == NULL)
+					continue;
+				for (j = 0; j < ccp->len; j++) {
+					val = ccp->pri[j * COLL_WEIGHTS_MAX + i];
+					if (val != 0)
 					*rp++ = val;
 				}
 			}
@@ -427,6 +442,7 @@ unicode_coll2numkey(struct collation_set
 		/* Between each level we place a zero weight marker. */
 		*rp++ = 0;
 	}
+
 	*lenp = rp - ret;
 	return ret;
 }
Index: src/lib/libc/locale/unicode_nfd_qc_data.h
diff -u src/lib/libc/locale/unicode_nfd_qc_data.h:1.1.2.1 src/lib/libc/locale/unicode_nfd_qc_data.h:1.1.2.2
--- src/lib/libc/locale/unicode_nfd_qc_data.h:1.1.2.1	Fri Jul 14 15:53:08 2017
+++ src/lib/libc/locale/unicode_nfd_qc_data.h	Mon Jul 31 04:29:50 2017
@@ -1,3 +1,5 @@
+#define UCD_NFD_QC_DATA_LENGTH 13232
+
 static struct ucd_nfd_qc ucd_nfd_qc_data[] =  {
  { 0x000C0, 0x0 },
  { 0x000C1, 0x0 },
Index: src/lib/libc/locale/unicode_ucd.c
diff -u src/lib/libc/locale/unicode_ucd.c:1.1.2.1 src/lib/libc/locale/unicode_ucd.c:1.1.2.2
--- src/lib/libc/locale/unicode_ucd.c:1.1.2.1	Fri Jul 14 15:53:08 2017
+++ src/lib/libc/locale/unicode_ucd.c	Mon Jul 31 04:29:50 2017
@@ -27,13 +27,19 @@
  * SUCH DAMAGE.
  */
 
+#ifdef COLLATION_TEST
+#include "collation_test.h"
+#include <stdio.h>
+#endif
+
 #include <assert.h>
 #include <stdlib.h>
 #include <sys/types.h>
 #include <locale.h>
 #include <wchar.h>
 #include "unicode_ucd.h"
-#include "unicode_decomp_data.h"
+#include "unicode_dm_data.h"
+/* #include "unicode_decomp_data.h" */
 /* #include "unicode_collation_data.h" - DUCET */
 #include "unicode_ccc_data.h"
 #include "unicode_nfd_qc_data.h"
@@ -43,31 +49,22 @@
 #include "setlocale_local.h"
 #include "collate_local.h"
 
-static const struct ucd_coll * ucd_get_collation_element(wchar_t *, const _CollateLocale *, int *, int);
+static int ucd_get_collation_element(wchar_t *, const struct xlocale_collate *, struct collation_element *, int);
 
-#define USE_NFD_QC_HASH_TABLE
+/* #define USE_NFD_QC_HASH_TABLE */
 #define USE_CCC_HASH_TABLE
-#define USE_DECOMP_HASH_TABLE
 /* #define USE_COLL_HASH_TABLE */
 
+#ifdef USE_NFD_QC_HASH_TABLE
 #define NFD_QC_WIDTH 1024
 #define NFD_QC_HASH(x) (((x) >> 2) & 0x3ff)
 static struct ucd_nfd_qc_head *nfd_qc_hash;
+#endif
 
 #define CCC_WIDTH 1024
 #define CCC_HASH(x) (((x) >> 2) & 0x3ff)
 static struct ucd_ccc_head *ccc_hash;
 
-#define DECOMP_WIDTH 1024
-#define DECOMP_HASH(x) (((x) >> 2) & 0x3ff)
-static struct ucd_decomp_head *decomp_hash;
-
-#ifdef USE_COLL_HASH_TABLE
-#define COLL_WIDTH 1024
-#define COLL_HASH(x) (((x) >> 2) & 0x3ff)
-static struct ucd_coll_head *coll_hash;
-#endif
-
 #define RESV_CP_WIDTH 1024
 #define RESV_CP_HASH(x) (((x) >> 2) & 0x3ff)
 static struct ucd_resv_cp_head *resv_cp_hash;
@@ -77,8 +74,8 @@ static struct ucd_resv_range_head resv_r
 uint8_t unicode_get_nfd_qc(wchar_t wc)
 {
 	int dfl = 1;
-	struct ucd_nfd_qc *ccp;
 #ifdef USE_NFD_QC_HASH_TABLE
+	struct ucd_nfd_qc *ccp;
 	int i;
 
 	if (nfd_qc_hash == NULL) {
@@ -99,13 +96,22 @@ uint8_t unicode_get_nfd_qc(wchar_t wc)
 		}
 	}
 #else
-	for (ccp = ucd_nfd_qc_data; ccp->cp; ++ccp) {
-		if (ccp->cp > wc) {
-			return dfl;
-		}
-		if (ccp->cp == wc) {
-			return ccp->nfd_qc;
-		}
+	int l, r, m;
+	wchar_t u;
+
+	/* Try single substitutions first */
+	l = 0;
+	r = UCD_NFD_QC_DATA_LENGTH - 1;
+	while (l <= r) {
+		m = (l + r) / 2;
+		u = ucd_nfd_qc_data[m].cp;
+		if (u == wc) {
+			return ucd_nfd_qc_data[m].nfd_qc;
+		}
+		if (u < wc)
+			l = m + 1;
+		else
+			r = m - 1;
 	}
 #endif /* USE_NFD_QC_HASH_TABLE */
 	return dfl;
@@ -127,9 +133,11 @@ uint8_t unicode_get_ccc(wchar_t wc)
 	}
 
 	SLIST_FOREACH(ccp, &ccc_hash[CCC_HASH(wc)], hash_next) {
+#if 0
 		if (ccp->cp < wc) {
 			return 0;
 		}
+#endif
 		if (ccp->cp == wc) {
 			return ccp->ccc;
 		}
@@ -153,133 +161,252 @@ uint8_t unicode_get_ccc(wchar_t wc)
  */
 wchar_t *unicode_decomposition(wchar_t wc, int *np)
 {
-	struct ucd_decomp *ccp;
-#ifdef USE_DECOMP_HASH_TABLE
-	int i;
+	int l, r, m;
+	wchar_t u;
 
-	if (decomp_hash == NULL) {
-		decomp_hash = (struct ucd_decomp_head *)calloc(DECOMP_WIDTH, sizeof(*decomp_hash));
-		for (i = 0; i < DECOMP_WIDTH; i++)
-			SLIST_INIT(decomp_hash + i);
-		for (ccp = ucd_decomp_data; ccp->cp; ++ccp) {
-			SLIST_INSERT_HEAD(&decomp_hash[DECOMP_HASH(ccp->cp)], ccp, hash_next);
-		}
-	}
-
-	SLIST_FOREACH(ccp, &decomp_hash[DECOMP_HASH(wc)], hash_next) {
-		if (ccp->cp < wc) {
-			return NULL;
-		}
-		if (ccp->cp == wc) {
-			*np = ccp->n;
-			return (wchar_t *)ccp->dm;
-		}
+	/* Try single substitutions first */
+	l = 0;
+	r = UCD_DECOMP_SINGLE_DATA_LENGTH - 1;
+	while (l <= r) {
+		m = (l + r) / 2;
+		u = ucd_decomp_single_data[m].cp;
+		if (u == wc) {
+			*np = 1;
+			return &ucd_decomp_single_data[m].dm;
+		}
+		if (u < wc)
+			l = m + 1;
+		else
+			r = m - 1;
 	}
-#else /* ! USE_DECOMP_HASH_TABLE */
-	for (ccp = ucd_decomp_data; ccp->cp; ++ccp) {
-		if (ccp->cp > wc)
-			return NULL;
-		if (ccp->cp == wc) {
-			*np = ccp->n;
-			return (wchar_t *)ccp->dm;
+	
+	/* Then pairs */
+	l = 0;
+	r = UCD_DECOMP_PAIR_DATA_LENGTH - 1;
+	while (l <= r) {
+		m = (l + r) / 2;
+		u = ucd_decomp_pair_data[m].cp;
+		if (u == wc) {
+			*np = 2;
+			return ucd_decomp_pair_data[m].dm;
+		}
+		if (u < wc)
+			l = m + 1;
+		else
+			r = m - 1;
+	}
+
+	/* Then the rest */
+	l = 0;
+	r = UCD_DECOMP_MISC_DATA_LENGTH - 1;
+	while (l <= r) {
+		m = (l + r) / 2;
+		u = ucd_decomp_misc_data[m].cp;
+		if (u == wc) {
+			*np = ucd_decomp_misc_data[m].n;
+			return ucd_decomp_misc_data[m].dm;
+		}
+		if (u < wc)
+			l = m + 1;
+		else
+			r = m - 1;
 		}
-	}
-#endif /* ! USE_DECOMP_HASH_TABLE */
 	return NULL;
-}
+	}
 
-static const struct ucd_coll *
-ucd_get_collation_element(wchar_t *wcp, const _CollateLocale *cloc, int *np, int do_ccc)
+int
+ucd_get_collation_element(wchar_t *wcp, const struct xlocale_collate *xc, struct collation_element *out, int do_ccc)
 {
-	const struct ucd_coll *cc, *best_cc = NULL, *sc;
-#ifdef USE_COLL_HASH_TABLE
-	struct ucd_coll *ccp;
-#endif
-	wchar_t *wcccp, tmp;
-	int best_len = 0;
-	int i, match;
+	struct collation_element sc;
+	wchar_t *wcccp, tmp, u;
+	size_t i, j, sclen, best = 0;
 	uint8_t ccc, this_ccc;
+	int m, l, r;
 
-#ifdef USE_COLL_HASH_TABLE
-	if (coll_hash == NULL) {
-		coll_hash = (struct ucd_coll_head *)calloc(COLL_WIDTH, sizeof(*coll_hash));
-		for (i = 0; i < COLL_WIDTH; i++)
-			SLIST_INIT(coll_hash + i);
-		for (ccp = ucd_collate_data; ccp->cp < 0x0FFFFFFF; ++ccp) {
-			wchar_t wc = ccp->cp;
-			if (wc == 0x0)
-				wc = ccp->cpp[0];
-			SLIST_INSERT_HEAD(&coll_hash[COLL_HASH(wc)], ccp, hash_next);
+	sc.flags = 0x0;
+	out->pri = NULL;
+	out->len = 1;
+
+	if (xc != NULL) {
+		/* Find the index of the leading character, if it appears */
+		l = m = 0;
+		r = xc->info->chain_count - 1;
+		while (l <= r) {
+			m = (l + r) / 2;
+			u = xc->chain_pri_table[m].str[0];
+			if (u == wcp[0]) {
+				break;
+		}
+			if (u < wcp[0])
+				l = m + 1;
+			else
+				r = m - 1;
+		}
+
+		/* Find the first element that begins with wcp[0] */
+		while (wcp[0] == xc->chain_pri_table[m].str[0])
+			--m;
+		++m;
+
+		if (wcp[0] == xc->chain_pri_table[m].str[0]) {
+			const wchar_t * pm = xc->chain_pri_table[m].str;
+			/* There is at least one matching chain. */
+			/* Walk through all the possibilities. */
+			while (wcp[0] == pm[0]) {
+				pm = xc->chain_pri_table[m].str;
+				/* Check that this element matches */
+				for (i = 0; pm[i]; i++) {
+					if (wcp[i] != pm[i])
+						break;
+	}
+				if (pm[i] == 0) {
+					/* It matches */
+					if (i > best) {
+						out->pri = xc->chain_pri_table[m].pri;
+						best = i;
 		}
 	}
+				++m;
+			}
+}
 
-	SLIST_FOREACH(cc, &coll_hash[COLL_HASH(*wcp)], hash_next) {
-#else
-	for (cc = cloc->coll_data; cc->cp < 0x0FFFFFFF; ++cc) {
-		if (cc->cp > wcp[0]) {
+		/* Check double chains. */
+		if (best == 0) {
+			l = 0;
+			r = xc->info->dchain_count - 1;
+			while (l <= r) {
+				m = (l + r) / 2;
+				u = xc->dchain_pri_table[m].str[0];
+				if (u == wcp[0]) {
+					break;
+		}
+				if (u < wcp[0])
+					l = m + 1;
+				else
+					r = m - 1;
+	}
+			/* Find the first element that begins with wcp[0] */
+			while (wcp[0] == xc->dchain_pri_table[m].str[0])
+				--m;
+			++m;
+
+			if (wcp[0] == xc->dchain_pri_table[m].str[0]) {
+				const wchar_t * pm = xc->dchain_pri_table[m].str;
+				/* There is at least one matching chain. */
+				/* Walk through all the possibilities. */
+				while (wcp[0] == pm[0]) {
+					pm = xc->dchain_pri_table[m].str;
+					/* Check that this element matches */
+					for (i = 0; pm[i]; i++) {
+						if (wcp[i] != pm[i])
 			break;
 		}
-#endif
+					if (pm[i] == 0) {
+						/* It matches */
+						if (i > best) {
+							out->pri = (weight_t *)xc->dchain_pri_table[m].pri;
+							for (j = 0; j < COLLATE_STR_LEN && xc->dchain_pri_table[m].pri[j][0] != (weight_t)0xFFFFFFFF; ++j)
+								;
+							out->len = j;
+							best = i;
+						}
+					}
+					++m;
+				}
+			}
+		}
 
-		/*
-		 * Multi-character matches
-		 */
-		if (cc->cp == 0x0) {
-			match = 1;
-			for (i = 0; cc->cpp[i] != 0x0; ++i) {
-				if (cc->cpp[i] != wcp[i]) {
-					match = 0;
+		if (best == 0) {
+			/* There is no matching chain. Look up individual weights. */
+			if (wcp[0] < 0x80) {
+				out->pri = xc->char_pri_table[wcp[0]].pri;
+				best = 1;
+			} else {
+				/* Now look at large matches */
+				l = 0;
+				r = xc->info->large_count;
+				while (l <= r) {
+					m = (l + r) / 2;
+					u = xc->large_pri_table[m].val;
+					if (u == wcp[0]) {
+						out->pri = xc->large_pri_table[m].pri;
+						best = 1;
 					break;
 				}
+					if (u < wcp[0])
+						l = m + 1;
+					else
+						r = m - 1;
 			}
-			if (match && i > best_len) {
-				best_cc = cc;
-				best_len = i;
 			}
-		} else if (cc->cp == wcp[0] && best_len == 0) {
-			/* Single-character match */
-			best_cc = cc;
-			best_len = 1;
+		}
+
+		/* Check reverse chains. */
+		if (best == 0) {
+			l = 0;
+			r = xc->info->rchain_count;
+			while (l <= r) {
+				m = (l + r) / 2;
+				u = xc->rchain_pri_table[m].val;
+				if (u == wcp[0]) {
+					out->pri = (weight_t *)xc->rchain_pri_table[m].pri;
+					for (i = 0; i < COLLATE_STR_LEN && xc->rchain_pri_table[m].pri[i][0] != (weight_t)0xFFFFFFFF; ++i)
+						;
+					out->len = i;
+					best = 1;
+					break;
+				}
+				if (u < wcp[0])
+					l = m + 1;
+				else
+					r = m - 1;
 		}
 	}
 
-	if (best_cc != NULL && do_ccc) {
+		if (best > 0 && do_ccc) {
 		/*
-		 * We have found the longest string S that matches in the table.
-		 * Now let the set of unblocked non-starters immediately following S
-		 * be called CC.  For any C in CC, if S+C has a match in the table,
-		 * replace S with S+C and remove C from CC.  Repeat as long as we
-		 * are expanding S.  Then copy the collation weights.
+			 * We have found the longest string S that matches in
+			 * the table.  Now let the set of unblocked
+			 * non-starters immediately following S be called CC.
+			 * For any C in CC, if S+C has a match in the table,
+			 * replace S with S+C and remove C from CC.  Repeat as
+			 * long as we are expanding S.  Then copy the
+			 * collation weights.
 		 */
 		ccc = 0xff;
-		for (wcccp = wcp + best_len; *wcccp; ++wcccp) {
+			for (wcccp = wcp + best; *wcccp; ++wcccp) {
 			this_ccc = unicode_get_ccc(*wcccp);
-			if (this_ccc == 0 || this_ccc > ccc)
+				if (this_ccc == 0)
 				break;
-			/* Swap this and wcp + best_len, and look for a match */
+				if (this_ccc == ccc)
+					continue;
+				ccc = this_ccc;
+
+				/* Swap this and wcp + best, and look for a match */
 			tmp = *wcccp;
-			*wcccp = *(wcp + best_len);
-			*(wcp + best_len) = tmp;
-			sc = ucd_get_collation_element(wcp, cloc, &best_len, 0);
-			if (sc != best_cc) {
-				best_cc = sc;
+				*wcccp = *(wcp + best);
+				*(wcp + best) = tmp;
+				sclen = ucd_get_collation_element(wcp, xc, &sc, 0);
+				if (sclen > best) {
+					*out = sc; /* Structure copy */
+					best = sclen;
 				/* Back up and start over */
-				wcccp = wcp + best_len;
+					wcccp = wcp + best;
 			} else {
-				/* swap back - best_len should be unchanged */
+					/* swap back */
 				tmp = *wcccp;
-				*wcccp = *(wcp + best_len);
-				*(wcp + best_len) = tmp;
+					*wcccp = *(wcp + best);
+					*(wcp + best) = tmp;
+				}
 			}
 		}
 	}
 
 	/* Fall back to DUCET if we don't find anything in the locale */
-	if (best_len == 0 && cloc != &_DefaultCollateLocale)
-		return ucd_get_collation_element(wcp, &_DefaultCollateLocale, np, do_ccc);
+	if (best == 0 && xc != &_DefaultCollateLocale)
+		return ucd_get_collation_element(wcp, &_DefaultCollateLocale, out, do_ccc);
 
-	*np = best_len;
-	return best_cc;
+	return best;
 }
 
 #define UNASSIGNED_CACHE_WIDTH 16
@@ -402,9 +529,6 @@ ucd_check_unassigned(wchar_t wc)
 	return unassigned;
 }
 
-#define _COLLATE_LOCALE(loc) \
-  ((_CollateLocale *)((loc)->part_impl[(size_t)LC_COLLATE]))
-
 /*
  * Add the collation element sequence that corresponds with
  * the beginning of the given string to head.
@@ -412,20 +536,27 @@ ucd_check_unassigned(wchar_t wc)
  */
 int unicode_get_collation_element(struct collation_set *head, wchar_t *wcp, locale_t loc)
 {
-	const struct ucd_coll *cc;
-	const weight_t *ce;
-	struct collation_element *new_element;
-	struct ucd_coll lcc;
-	int n, unassigned;
-	_CollateLocale *cloc = _COLLATE_LOCALE(loc);
-
-	cc = ucd_get_collation_element(wcp, cloc, &n, 1);
-
-	if (cc == NULL) {
+	struct collation_element *cc;
+	int n, unassigned, len;
+	struct xlocale_collate *cloc;
+	weight_t *wp;
+	weight_t aaaa = 0x0, bbbb = 0x0;
+
+	cc = (struct collation_element *)malloc(sizeof(*cc));
+	cc->flags = 0x0;
+
+	if (loc == NULL)
+		cloc = NULL;
+	else
+		cloc = _COLLATE_LOCALE(loc);
+	len = ucd_get_collation_element(wcp, cloc, cc, 1);
+
+	if (len)
+		n = len;
+	else {
 		unassigned = ucd_check_unassigned(*wcp);
 
 		/* If we couldn't find it, supply implicit weights. */
-		weight_t aaaa = 0x0, bbbb = 0x0;
 		if (!unassigned) {
 			if ((*wcp >= 0x17000 && *wcp <= 0x187FF) /* Tangut block */
 			    || (*wcp >= 0x18800 && *wcp <= 0x18AFF)) /* Tangut_Components block */
@@ -467,18 +598,18 @@ int unicode_get_collation_element(struct
 			bbbb = (*wcp & 0x7FFF) | 0x8000;
 		}
 
-		lcc.ce[0] = (aaaa << 32) | 0x00200002;
-		lcc.ce[1] = (bbbb << 32) | 0x0;
-		lcc.ce[2] = 0x0;
-		cc = &lcc;
+		wp = (weight_t *)calloc(2 * COLL_WEIGHTS_MAX, sizeof(weight_t));
+		wp[0] = aaaa;
+		wp[1] = 0x0020;
+		wp[2] = 0x0002;
+		wp[1 * COLL_WEIGHTS_MAX + 0] = bbbb;
+		cc->pri = (const weight_t *)wp;
+		cc->len = 2;
+		cc->flags |= CE_FLAGS_NEEDSFREE;
 		n = 1;
 	}
 
-	for (ce = cc->ce; *ce; ++ce) {
-		new_element = (struct collation_element *)malloc(sizeof(*new_element));
-		new_element->weight = *ce;
-		TAILQ_INSERT_TAIL(head, new_element, tailq);
-	}
+	TAILQ_INSERT_TAIL(head, cc, tailq);
 
 	return n;
 }
Index: src/lib/libc/locale/unicode_ucd.h
diff -u src/lib/libc/locale/unicode_ucd.h:1.1.2.1 src/lib/libc/locale/unicode_ucd.h:1.1.2.2
--- src/lib/libc/locale/unicode_ucd.h:1.1.2.1	Fri Jul 14 15:53:08 2017
+++ src/lib/libc/locale/unicode_ucd.h	Mon Jul 31 04:29:50 2017
@@ -30,28 +30,54 @@
 #ifndef _UNICODE_UCD_H
 #define _UNICODE_UCD_H
 
+#ifdef COLLATION_TEST
+#include "collation_test.h"
+#endif
+
 #include <sys/types.h>
 #include <sys/queue.h>
 #include <locale.h>
 
+#include "collate_local.h"
+#include "collate.h"
+
 typedef int32_t ucd_codepoint_t;
-typedef uint64_t weight_t;
 
 TAILQ_HEAD(collation_set, collation_element);
 SLIST_HEAD(ucd_nfd_qc_head, ucd_nfd_qc);
 SLIST_HEAD(ucd_ccc_head, ucd_ccc);
-SLIST_HEAD(ucd_decomp_head, ucd_decomp);
 SLIST_HEAD(ucd_coll_head, ucd_coll);
 SLIST_HEAD(ucd_resv_cp_head, ucd_resv_cp);
 SLIST_HEAD(ucd_resv_range_head, ucd_resv_range);
 
+#define _COLLATE_LOCALE(loc) \
+  ((struct xlocale_collate *)((loc)->part_impl[(size_t)LC_COLLATE]))
+
 /* Data definition */
+#if 0
 struct ucd_decomp {
 	ucd_codepoint_t cp;
 	int n;
 	ucd_codepoint_t dm[20]; /* At present, no more than 18 */
 	SLIST_ENTRY(ucd_decomp) hash_next;
 };
+#endif
+
+struct ucd_decomp_single {
+	ucd_codepoint_t cp;
+	ucd_codepoint_t dm;
+};
+
+struct ucd_decomp_pair {
+	ucd_codepoint_t cp;
+	ucd_codepoint_t dm[2];
+};
+
+struct ucd_decomp_misc {
+	ucd_codepoint_t cp;
+	int n;
+	ucd_codepoint_t dm[20]; /* At present, no more than 18 */
+};
 
 struct ucd_ccc {
 	ucd_codepoint_t cp;
@@ -65,13 +91,6 @@ struct ucd_nfd_qc {
 	SLIST_ENTRY(ucd_nfd_qc) hash_next;
 };
 
-struct ucd_coll {
-	ucd_codepoint_t cp;
-	ucd_codepoint_t cpp[5]; /* XXX */
-	weight_t ce[20]; /* XXX */
-	SLIST_ENTRY(ucd_coll) hash_next;
-};
-
 struct ucd_resv_cp {
 	ucd_codepoint_t cp;
 	SLIST_ENTRY(ucd_resv_cp) hash_next;
@@ -84,7 +103,10 @@ struct ucd_resv_range {
 };
 
 struct collation_element {
-	weight_t weight;
+	const weight_t *pri;
+	int len;
+#define CE_FLAGS_NEEDSFREE 0x00000001
+	u_int32_t flags;
 	TAILQ_ENTRY(collation_element) tailq;
 };
 

Reply via email to