Hello Sebastian,
The attached file introduces a new configuration parameter in the
virtuoso.ini config file. The parameter controls whether accented
UNICODE characters should be converted to their non-accented base
variants at the very beginning of free-text indexing or parsing a
free-text query string.
The parameter is named XAnyNormalization; it should be placed in the [I18N]
section of virtuoso.ini; its value is an integer bitmask with
only 2 bits in use at the moment:
0 --- the default behavior, do not normalize anything, so "José" and
"Jose" are two distinct words.
1 --- Any pair of base char and combining char (NSM, non-spacing
modifier) is replaced with a single combined char, so if character "é"
is written as a sequence of "base" character "e" and the Unicode char
U+0301 ("combining acute accent") then the pair will be replaced with
a single U+00E9 ("latin small letter e acute").
2 --- Any combined char is converted to its (smallest known) base. So
"é" will lose its accent and become plain old ASCII "e".
3 = 1|2 --- Perform both conversions. As a result, a pair of base char and
combining char loses its second char and chars with accents lose
their accents.
If the parameter is required at all, the needed value is probably 3. So
the fragment of virtuoso.ini is
[I18N]
XAnyNormalization=3
In some rare cases the value of 1 can be appropriate. The parameter
should be set once before creating the database. If it is changed on an
existing database, all free-text indexes that may contain non-ASCII data
should be re-created. On a typical system, the parameter affects all
text columns, XML columns, RDF literals and queries.
Strictly speaking, it does not affect all of them, only items that use
the default "x-any" language or a language derived from x-any such as "en"
and "en-US"; unless you have written new C plugins for custom
languages, you do not need to worry about this distinction.
As an example, with
XAnyNormalization=3
one can get the following:
sparql insert in <http://InternationalNSMs/> { <s> <sp> "Índio João
Macapá Júnior Tôrres Luís Araújo José" ; <ru> "Он добавил картошки,
посолил и поставил аквариум на огонь" . }
Insert into <http://InternationalNSMs/>, 2 (or less) triples -- done
DB.DBA.RDF_OBJ_FT_RULE_ADD (null, null, 'InternationalNSMs.wb')
Done. -- 0 msec.
vt_inc_index_db_dba_rdf_obj()
Done. -- 26 msec.
sparql select * from <http://InternationalNSMs/> where { ?s ?p ?o }
order by asc (str(?o))
s sp Índio João Macapá Júnior Tôrres Luís Araújo José
s ru Он добавил картошки, посолил и поставил аквариум на огонь
2 Rows. -- 2 msec.
sparql select * from <http://InternationalNSMs/> where { ?s ?p ?o . ?o
bif:contains "'Índio João Macapá Júnior Tôrres Luís Araújo José'" . }
s sp Índio João Macapá Júnior Tôrres Luís Araújo José
1 Rows. -- 2 msec.
sparql select * from <http://InternationalNSMs/> where { ?s ?p ?o . ?o
bif:contains "'Indio Joao Macapa Junior Torres Luis Araujo Jose'" . }
s sp Índio João Macapá Júnior Tôrres Luís Araújo José
1 Rows. -- 1 msec.
sparql select * from <http://InternationalNSMs/> where { ?s ?p ?o . ?o
bif:contains "'поставил аквариум на огонь'" . }
s ru Он добавил картошки, посолил и поставил аквариум на огонь
There was also a request for a function that normalizes characters in
strings as the free-text engine does with XAnyNormalization=3; that
function will be provided as a separate patch and will depend on this
specific patch.
Best Regards,
Ivan Mikhailov
OpenLink Software
http://virtuoso.openlinksw.com
Index: binsrc/virtuoso/viconfig.c
===================================================================
RCS file: /home/staff/us-cvs/virtuoso/binsrc/virtuoso/viconfig.c,v
retrieving revision 1.228
diff -u -U 10 -r1.228 viconfig.c
--- binsrc/virtuoso/viconfig.c 7 Jan 2011 13:57:26 -0000 1.228
+++ binsrc/virtuoso/viconfig.c 13 Jan 2011 14:32:32 -0000
@@ -329,23 +329,25 @@
char *c_server_default_language_name = 0;
int32 c_http_threads = 0;
int32 c_http_max_keep_alives = 0;
int32 c_http_keep_alive_timeout = 0;
int32 c_http_max_cached_proxy_connections = 0;
int32 c_http_proxy_connection_cache_timeout = 0;
int32 c_http_thread_sz = 280000;
int32 c_http_keep_hosting = 0;
extern long http_keep_hosting; /* from http.c */
char *c_ucm_load_path = 0;
+int32 c_lh_xany_normalization_flags = 0;
int32 c_i18n_wide_file_names = 0;
char *c_i18n_volume_encoding = NULL;
char *c_i18n_volume_emergency_encoding = NULL;
+extern int lh_xany_normalization_flags;
extern int i18n_wide_file_names;
extern struct encoding_handler_s *i18n_volume_encoding;
extern struct encoding_handler_s *i18n_volume_emergency_encoding;
char *c_plugin_load_path = 0;
int32 c_http_ses_trap = 0;
int32 c_http_check_rdf_accept = 0;
int32 c_iri_cache_size = 0;
int32 c_lite_mode = 0;
int32 c_uriqa_dynamic_local = 0;
@@ -1485,20 +1487,22 @@
eh_load_handler (new_eh);
}
free (ucm_file);
free (ucm_names);
}
}
/* Initialization of national filesystems */
section = "I18N";
+ if (cfg_getlong (pconfig, section, "XAnyNormalization", &c_lh_xany_normalization_flags) == -1)
+ c_lh_xany_normalization_flags = 0;
if (cfg_getlong (pconfig, section, "WideFileNames", &c_i18n_wide_file_names) == -1)
c_i18n_wide_file_names = 0;
if (cfg_getstring (pconfig, section, "VolumeEncoding", &c_i18n_volume_encoding) == -1)
c_i18n_volume_encoding = NULL;
if (cfg_getstring (pconfig, section, "VolumeEmergencyEncoding", &c_i18n_volume_emergency_encoding) == -1)
c_i18n_volume_emergency_encoding = NULL;
/* Initialization of plugins */
@@ -1747,20 +1751,21 @@
}
else if (0 != (it_n_maps % 2))
{
it_n_maps = 2 * (it_n_maps / 2);
}
uriqa_dynamic_local = c_uriqa_dynamic_local;
sparql_result_set_max_rows = c_sparql_result_set_max_rows;
sparql_max_mem_in_use = c_sparql_max_mem_in_use;
cli_encryption_on_password = c_cli_encryption_on_password;
+ lh_xany_normalization_flags = c_lh_xany_normalization_flags;
i18n_wide_file_names = c_i18n_wide_file_names;
if (NULL != c_i18n_volume_encoding)
{
i18n_volume_encoding = eh_get_handler (c_i18n_volume_encoding);
if (NULL == i18n_volume_encoding)
{
log_error ("The value of VolumeEncoding parameter is not a valid encoding name");
return;
}
}
Index: libsrc/Wi/bif_intl.c
===================================================================
RCS file: /home/staff/us-cvs/virtuoso/libsrc/Wi/bif_intl.c,v
retrieving revision 1.60.2.12
diff -u -U 10 -r1.60.2.12 bif_intl.c
--- libsrc/Wi/bif_intl.c 17 Sep 2010 13:32:01 -0000 1.60.2.12
+++ libsrc/Wi/bif_intl.c 13 Jan 2011 14:52:20 -0000
@@ -31,20 +31,21 @@
#include <limits.h>
#include "wi.h"
#include "libutil.h"
#include "sqlnode.h"
#include "eqlcomp.h"
#include "sqlfn.h"
#include "sqlbif.h"
#include "multibyte.h"
#include "srvmultibyte.h"
#include "xml.h"
+#include "security.h"
#ifdef __cplusplus
extern "C" {
#endif
#include "xmlparser.h"
/*#include "xmlparser_impl.h"*/
#include "langfunc.h"
#ifdef __cplusplus
}
#endif
@@ -1022,20 +1023,27 @@
else if (!strcmp (enctype, "8-BIT"))
ASSERT_BOX_8BIT(box);
else if (!strcmp (enctype, "WCHAR"))
ASSERT_BOX_WCHAR(box);
else
sqlr_new_error ("22023", "SR533",
"Second argument of dbg_assert_encoding() must be one of 'UTF-8', '8-BIT', 'WCHAR', not '%.1000s'", enctype);
return box_copy_tree (box);
}
+static caddr_t
+bif_dbg_set_lh_xany_normalization_flags (caddr_t * qst, caddr_t * err_ret, state_slot_t ** args)
+{ /* Debug BIF: after sec_check_dba() (presumably rejects non-DBA callers), stores integer argument 0 into the global lh_xany_normalization_flags */
+ sec_check_dba ((query_instance_t *)qst, "dbg_set_lh_xany_normalization_flags");
+ lh_xany_normalization_flags = bif_long_arg (qst, args, 0, "dbg_set_lh_xany_normalization_flags");
+ return NULL; /* declared to return caddr_t; falling off the end handed the caller an undefined value */
+}
wcharset_t *
wcharset_by_name_or_dflt (ccaddr_t cs_name, query_instance_t *qi)
{
wcharset_t * charset = NULL;
if (NULL != cs_name)
{
if (!stricmp (cs_name, "UTF-8"))
return CHARSET_UTF8;
if (!stricmp (cs_name, "_WIDE_"))
return CHARSET_WIDE;
@@ -1142,13 +1150,14 @@
bif_define ("__uname", bif_quick_uname);
bif_define ("charsets_list", bif_charsets_list);
bif_define_typed ("unicode_toupper", bif_unicode_toupper, &bt_integer);
bif_define_typed ("unicode_tolower", bif_unicode_tolower, &bt_integer);
bif_define ("unicode_char_properties", bif_unicode_char_properties);
bif_define_typed ("iswidestring", bif_iswidestring, &bt_integer);
#ifndef NDEBUG
bif_define ("set_utf8_output", bif_set_utf8_output);
#endif
bif_define ("dbg_assert_encoding", bif_dbg_assert_encoding);
+ bif_define ("__dbg_set_lh_xany_normalization_flags", bif_dbg_set_lh_xany_normalization_flags);
bif_define_typed ("langmatches_pct_http", bif_langmatches_pct_http, &bt_integer);
}
Index: libsrc/Wi/bif_text.c
===================================================================
RCS file: /home/staff/us-cvs/virtuoso/libsrc/Wi/bif_text.c,v
retrieving revision 1.145.2.28
diff -u -U 10 -r1.145.2.28 bif_text.c
--- libsrc/Wi/bif_text.c 22 Nov 2010 16:33:11 -0000 1.145.2.28
+++ libsrc/Wi/bif_text.c 12 Jan 2011 20:22:48 -0000
@@ -271,20 +271,21 @@
if (NULL == eh)
log_error ("Unsupported encoding \"%s\" used in noise.txt file, some strings may be ignored", name);
continue;
}
if (NULL == eh)
continue;
res = lh_iterate_patched_words (eh, lh, nw, tail - nw, lh->lh_is_vtb_word, NULL, noise_word_init_callback, (void *) (future_noise_words));
res |= lh_iterate_patched_words (eh, lh, nw, tail - nw, lh->lh_is_vtb_word, lh->lh_tocapital_word, noise_word_init_callback, (void *) (future_noise_words));
res |= lh_iterate_patched_words (eh, lh, nw, tail - nw, lh->lh_is_vtb_word, lh->lh_toupper_word, noise_word_init_callback, (void *) (future_noise_words));
res |= lh_iterate_patched_words (eh, lh, nw, tail - nw, lh->lh_is_vtb_word, lh->lh_tolower_word, noise_word_init_callback, (void *) (future_noise_words));
+ res |= lh_iterate_patched_words (eh, lh, nw, tail - nw, lh->lh_is_vtb_word, lh->lh_normalize_word, noise_word_init_callback, (void *) (future_noise_words));
if (res)
log_error ("Broken text in noise.txt file, (encoding \"%s\"): %s", eh->eh_names[0], nw);
}
fclose (noise);
id_hash_free (*noise_ht);
*noise_ht = future_noise_words;
}
caddr_t
Index: libsrc/langfunc/GNUmakefile
===================================================================
RCS file: /home/staff/us-cvs/virtuoso/libsrc/langfunc/GNUmakefile,v
retrieving revision 1.14
diff -u -U 10 -r1.14 GNUmakefile
--- libsrc/langfunc/GNUmakefile 5 Nov 2008 10:22:25 -0000 1.14
+++ libsrc/langfunc/GNUmakefile 12 Jan 2011 14:04:36 -0000
@@ -52,21 +52,21 @@
$(LIBDIR)/liblangfunc.la: liblangfunc.la
$(LIBTOOL) --mode=install $(INSTALL) -m 644 liblangfunc.la $(LIBDIR)
langfunc.o: langfunc.h latin1ctype.h langfunc_templ.c
latin1ctype.o: latin1ctype.h
unicode3.o: unicode3_all_chars.h
-unicode3_all_chars.h : unicode3.dat unicodetab2h.sh
+unicode3_all_chars.h unicode3_basechars.h unicode3_lowers.h unicode3_uppers.h unicode3_spaces.h: unicode3.dat unicodetab2h.sh
. ./unicodetab2h.sh
%.lo: %.c
$(LIBTOOL) --mode=compile $(CC) $(CFLAGS) -I../../Wi -c $<
liblangfunc.la: $(LT_LANGFUNC_OBJ) GNUmakefile unicode3_all_chars.h langfunc.h
$(LIBTOOL) --mode=link $(CC) -static -o liblangfunc.la $(LT_LANGFUNC_OBJ) \
-rpath $(LIBDIR)
.PHONY: generated
Index: libsrc/langfunc/langfunc.c
===================================================================
RCS file: /home/staff/us-cvs/virtuoso/libsrc/langfunc/langfunc.c,v
retrieving revision 1.14
diff -u -U 10 -r1.14 langfunc.c
--- libsrc/langfunc/langfunc.c 5 Aug 2009 05:01:18 -0000 1.14
+++ libsrc/langfunc/langfunc.c 14 Jan 2011 15:05:31 -0000
@@ -15,20 +15,21 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*
*/
#include "langfunc.h"
#include "latin1ctype.h"
+/*#define LH_ITERATOR_DEBUG 1*/
static int unichar_getprops_stub (unichar uchr);
/* The following table is a source for initialization of work_uniblocks.
It contains blocks as they described in Unicode-3. Some of them contains
others, being "parents" of their sub-blocks.
All Unicode functions uses work_uniblocks array. It is initially filled by
dummy data, and should be filled by call of reset_work_uniblocks() function.
Later, language plugins and other units may modify items of work_uniblocks,
@@ -37,21 +38,21 @@
reduce the size of table and thus accelerate search. */
unicode_block_t raw_uniblocks[] = {
/*_______________________________________________________________________________________________________________________*/
/* Group of characters | Props | Min | Max | | | |*/
/*==============================================|===============|===============|===============|=======|=======|=======|*/
{ "Basic Multilingual Plane 0" , UCP_GAP , 0x0000 , 0xFFFF , NULL , NULL , NULL },
{ "A-zone (alphabetic)" , UCP_MIX , 0x0000 , 0x33FF , NULL , NULL , NULL },
{ "General Scripts Area" , UCP_ALPHA , 0x0000 , 0x1FFF , NULL , NULL , NULL },
{ "Basic Latin (US-ASCII)" , UCP_MIX , 0x0000 , 0x007F , NULL , NULL , NULL },
-{ "Latin-1 (ISO-8859-1)" , UCP_MIX , 0x0080 , 0x00FF , NULL , NULL , NULL },
+{ "Latin-1 (ISO-8859-1)" , UCP_MIX , 0x0080 , 0x00FF , NULL , NULL , NULL },
{ "Latin Extended" , UCP_ALPHA , 0x0100 , 0x024F , NULL , NULL , NULL },
{ "IPA Extensions" , UCP_ALPHA , 0x0250 , 0x02AF , NULL , NULL , NULL },
{ "Spacing Modifier Letters" , UCP_ALPHA , 0x02B0 , 0x02FF , NULL , NULL , NULL },
{ "Combining Diacritical Marks" , UCP_ALPHA , 0x0300 , 0x036F , NULL , NULL , NULL },
{ "Greek" , UCP_ALPHA , 0x0370 , 0x03FF , NULL , NULL , NULL },
{ "Cyrillic" , UCP_ALPHA , 0x0400 , 0x04FF , NULL , NULL , NULL },
{ "Armenian" , UCP_ALPHA , 0x0530 , 0x058F , NULL , NULL , NULL },
{ "Hebrew" , UCP_ALPHA , 0x0590 , 0x05FF , NULL , NULL , NULL },
{ "Arabic" , UCP_ALPHA , 0x0600 , 0x06FF , NULL , NULL , NULL },
{ "Syriac" , UCP_ALPHA , 0x0700 , 0x074D , NULL , NULL , NULL },
@@ -123,21 +124,21 @@
{ "Katakana Punctuation" , UCP_PUNCT , 0x30FB , 0x30FE , NULL , NULL , NULL },
{ "Bopomofo" , UCP_ALPHA , 0x3100 , 0x312F , NULL , NULL , NULL },
{ "Hangul Compatibility Jamo" , UCP_MIX , 0x3130 , 0x318F , NULL , NULL , NULL },
{ "Hangul Compatibility Jamo Script" , UCP_ALPHA , 0x3130 , 0x3163 , NULL , NULL , NULL },
{ "Hangul Compatibility Jamo Punctuation" , UCP_PUNCT , 0x3164 , 0x3164 , NULL , NULL , NULL },
{ "Hangul Compatibility Jamo Script" , UCP_ALPHA , 0x3165 , 0x318F , NULL , NULL , NULL },
{ "Kanbun" , UCP_IDEO , 0x3190 , 0x319F , NULL , NULL , NULL },
{ "Enclosed CJK Letters and Months" , UCP_IDEO , 0x3200 , 0x32FF , NULL , NULL , NULL },
{ "CJK Compatibility" , UCP_IDEO , 0x3300 , 0x33FF , NULL , NULL , NULL },
/*----------------------------------------------|---------------|---------------|---------------|-------|-------|-------|*/
-{ "I-zone (ideographic)" , UCP_IDEO , 0x3400 , 0x9FFF , NULL , NULL , NULL },
+{ "I-zone (ideographic)" , UCP_IDEO , 0x3400 , 0x9FFF , NULL , NULL , NULL },
{ "CJK Unified Ideographs, Extension A" , UCP_IDEO , 0x3400 , 0x4DFF , NULL , NULL , NULL }, /* Hangul syllables in Unicode 1, undefined in Unicode 2, ideographs in Unicode 3 */
{ "CJK Unified Ideographs" , UCP_IDEO , 0x4E00 , 0x9FA5 , NULL , NULL , NULL },
/*----------------------------------------------|---------------|---------------|---------------|-------|-------|-------|*/
{ "O-zone (other)" , UCP_PUNCT , 0xA000 , 0xD7FF , NULL , NULL , NULL },
{ "Yi" , UCP_ALPHA , 0xA000 , 0xA4C8 , NULL , NULL , NULL },
{ "Hangul syllables" , UCP_ALPHA , 0xAC00 , 0xD7A3 , NULL , NULL , NULL },
{ "S-zone (surrogates)" , UCP_PUNCT , 0xD800 , 0xDFFF , NULL , NULL , NULL },
{ "High Surrogates" , UCP_PUNCT , 0xD800 , 0xDBFF , NULL , NULL , NULL },
{ "Low Surrogates" , UCP_PUNCT , 0xDC00 , 0xDFFF , NULL , NULL , NULL },
{ "R-zone (reserved)" , UCP_GAP , 0xE000 , 0xFFFD , NULL , NULL , NULL },
@@ -525,64 +526,131 @@
{
size_t ctr;
if (WORD_MAX_CHARS < srcbufsize)
return 0;
for (ctr = 0; ctr < srcbufsize; ctr++)
tgtbuf[ctr] = unichar_getlcase (srcbuf[ctr]);
tgtbufsize[0] = srcbufsize;
return 1;
}
+int lh_xany_normalization_flags = 0;
int lh_normalize_word__xany (const unichar *srcbuf, size_t srcbufsize, unichar *tgtbuf, size_t *tgtbufsize)
{
- size_t ctr;
+ size_t ctr, tgt_count, srcsz1;
/* int isspecial = 0; */
- if (WORD_MAX_CHARS < srcbufsize)
+ if ((WORD_MAX_CHARS < srcbufsize) || (1 > srcbufsize))
return 0;
- for (ctr = 0; ctr < srcbufsize; ctr++)
+ switch (lh_xany_normalization_flags & (LH_XANY_NORMALIZATION_COMBINE | LH_XANY_NORMALIZATION_TOBASE))
{
- unichar ucased = unichar_getucase (srcbuf[ctr]);
- tgtbuf[ctr] = ucased;
- /* if (ucased < 'A')
- isspecial = 1; */
+ case LH_XANY_NORMALIZATION_COMBINE | LH_XANY_NORMALIZATION_TOBASE:
+ tgt_count = 0;
+ srcsz1 = srcbufsize-1;
+ for (ctr = 0; ctr < srcsz1; ctr++)
+ {
+ unichar u = srcbuf[ctr];
+ unichar next = srcbuf[ctr+1];
+ unichar res;
+ if ((next >= unicode3_min_used_modif_char) && (next <= unicode3_max_used_modif_char))
+ {
+ res = unicode3_combine_base_and_modif_upper (u, next);
+ if (res)
+ {
+ tgtbuf[tgt_count++] = unicode3_getupperbasechar (res);
+ ctr++;
+ continue;
+ }
+ }
+ res = unicode3_getupperbasechar (u);
+ tgtbuf[tgt_count++] = res;
+ }
+ if (ctr < srcbufsize)
+ tgtbuf[tgt_count++] = unicode3_getupperbasechar (srcbuf[ctr]);
+ break;
+ case LH_XANY_NORMALIZATION_COMBINE: /* Merge base+NSM pairs and uppercase; accents are kept because the TOBASE bit is not set */
+ tgt_count = 0;
+ srcsz1 = srcbufsize-1;
+ for (ctr = 0; ctr < srcsz1; ctr++)
+ {
+ unichar u = srcbuf[ctr];
+ unichar next = srcbuf[ctr+1];
+ unichar res;
+ if ((next >= unicode3_min_used_modif_char) && (next <= unicode3_max_used_modif_char))
+ {
+ res = unicode3_combine_base_and_modif_upper (u, next);
+ if (res)
+ {
+ tgtbuf[tgt_count++] = res; /* result of the _upper variant is stored as is; reducing it to a base char would drop the accent */
+ ctr++; /* the modifier has been consumed together with its base char */
+ continue;
+ }
+ }
+ res = unichar_getucase (u); /* plain uppercasing, same as for the last char below and in case 0 */
+ tgtbuf[tgt_count++] = res;
+ }
+ if (ctr < srcbufsize)
+ tgtbuf[tgt_count++] = unichar_getucase (srcbuf[ctr]);
+ break;
+ case LH_XANY_NORMALIZATION_TOBASE:
+ for (ctr = 0; ctr < srcbufsize; ctr++)
+ {
+ unichar u = srcbuf[ctr];
+ u = unicode3_getupperbasechar (u);
+ tgtbuf[ctr] = u;
+ /* if (u < 'A')
+ isspecial = 1; */
+ }
+ tgt_count = srcbufsize;
+ break;
+ case 0:
+ for (ctr = 0; ctr < srcbufsize; ctr++)
+ {
+ unichar u = srcbuf[ctr];
+ u = unichar_getucase (u);
+ tgtbuf[ctr] = u;
+ /* if (u < 'A')
+ isspecial = 1; */
+ }
+ tgt_count = srcbufsize;
+ break;
}
#if 0 /* This is commented out because this plural-to-single is not fully valid */
- if (isspecial || (srcbufsize < 3) || ('S' != tgtbuf[srcbufsize - 1]) || ('S' == tgtbuf[srcbufsize - 2]))
+ if (isspecial || (tgt_count < 3) || ('S' != tgtbuf[tgt_count - 1]) || ('S' == tgtbuf[tgt_count - 2]))
{ /* Special or singular */
- tgtbufsize[0] = srcbufsize;
+ tgtbufsize[0] = tgt_count;
return 1;
}
- if ('E' == tgtbuf[srcbufsize - 2])
+ if ('E' == tgtbuf[tgt_count - 2])
{ /* "...ES" plural */
- if ((3 == srcbufsize) && ('Y' == tgtbuf[0]))
+ if ((3 == tgt_count) && ('Y' == tgtbuf[0]))
{ /* "YES" is singular */
- tgtbufsize[0] = srcbufsize;
+ tgtbufsize[0] = tgt_count;
return 1;
}
- if ('I' == tgtbuf[srcbufsize - 3])
+ if ('I' == tgtbuf[tgt_count - 3])
{ /* "...IES" plural */
- tgtbuf[srcbufsize - 3] = 'Y';
- tgtbufsize[0] = srcbufsize - 2;
+ tgtbuf[tgt_count - 3] = 'Y';
+ tgtbufsize[0] = tgt_count - 2;
return 1;
}
- if ('S' == tgtbuf[srcbufsize - 3])
+ if ('S' == tgtbuf[tgt_count - 3])
{ /* "...SES" plural */
- tgtbufsize[0] = srcbufsize - 2;
+ tgtbufsize[0] = tgt_count - 2;
return 1;
}
- tgtbufsize[0] = srcbufsize - 1;
+ tgtbufsize[0] = tgt_count - 1;
return 1;
}
- tgtbufsize[0] = srcbufsize - 1; /* "...S" plural */
+ tgtbufsize[0] = tgt_count - 1; /* "...S" plural */
#else
- tgtbufsize[0] = srcbufsize;
+ tgtbufsize[0] = tgt_count;
#endif
return 1;
}
#define LH_COUNT_WORDS_NAME lh_count_words__xany
#define LH_ITERATE_WORDS_NAME lh_iterate_words__xany
#define LH_ITERATE_PATCHED_WORDS_NAME lh_iterate_patched_words__xany
#define UNICHAR_GETPROPS_EXPN(buf,bufsize,pos) (unichar_getprops(buf[pos]))
#define DBG_PRINTF_NOISE_WORD(word_start,word_length) dbg_printf (("Noise word in text, start %ld, length %ld\n", (long)word_start, (long)word_length))
Index: libsrc/langfunc/langfunc.h
===================================================================
RCS file: /home/staff/us-cvs/virtuoso/libsrc/langfunc/langfunc.h,v
retrieving revision 1.16
diff -u -U 10 -r1.16 langfunc.h
--- libsrc/langfunc/langfunc.h 5 Feb 2007 09:07:10 -0000 1.16
+++ libsrc/langfunc/langfunc.h 13 Jan 2011 13:35:18 -0000
@@ -236,22 +236,35 @@
/*! \brief Resets internal table of blocks to some initial state */
int reset_work_uniblocks(void);
/*! \brief Returns block containing given unichar, or NULL for invalid unichar */
unicode_block_t *ub_getblock(unichar uchr);
/*! \brief Returns given unichar uppercased, based on data from Unicode3 tables */
extern unichar unicode3_getucase (unichar uchr);
/*! \brief Returns given unichar lowercased, based on data from Unicode3 tables */
extern unichar unicode3_getlcase (unichar uchr);
+/*! \brief Returns given unichar converted to a base char (i.e. remove umlauts, accents etc.) */
+extern unichar unicode3_getbasechar (unichar uchr);
+/*! \brief An accelerated superposition of unicode3_getbasechar and then unicode3_getucase */
+extern unichar unicode3_getupperbasechar (unichar uchr);
+/*! \brief Returns a char that is combination of a base char and NSM modifier, i.e. slightly "inverse" to unicode3_getbasechar */
+extern unichar unicode3_combine_base_and_modif (unichar base, unichar modif);
+/*! \brief An accelerated superposition of unicode3_combine_base_and_modif and then unicode3_getucase */
+extern unichar unicode3_combine_base_and_modif_upper (unichar base, unichar modif);
/*! \brief Returns if given unichar is a 'logical space' character */
extern int unicode3_isspace (unichar uchr);
+/*! \brief The minimal nonspacing modifier (NSM) char, like umlaut or accent, that is used to modify another character */
+extern unichar unicode3_min_used_modif_char;
+/*! \brief The maximal nonspacing modifier (NSM) char, like umlaut or accent, that is used to modify another character. Not every char between \c unicode3_min_used_modif_char and this one is an NSM, but all NSMs actually used as modifiers fall in this interval */
+extern unichar unicode3_max_used_modif_char;
+
/*! \brief Returns properties of unichar */
EXE_EXPORT (int, unichar_getprops, (unichar uchr));
/*! \brief Returns given unichar uppercased, faster than unichar3_getucase(), but maybe less accurate */
EXE_EXPORT (unichar, unichar_getucase, (unichar uchr));
/*! \brief Returns given unichar lowercased, faster than unichar3_getlcase(), but maybe less accurate */
EXE_EXPORT (unichar, unichar_getlcase, (unichar uchr));
/*! \brief Returns nonzero if given unichar is alphabetical character */
#define IS_UNICHAR_ALPHA(uchr) (unichar_getprops(uchr) & UCP_ALPHA)
@@ -649,20 +662,25 @@
extern encoding_handler_t eh__ISO8859_1;
/*! \brief Handler of "WIDE identity" encoding */
extern encoding_handler_t eh__WIDE_121;
/* These functions require pointer to encoding as additional argument */
extern eh_decode_char_t eh_decode_char__charset;
extern eh_encode_char_t eh_encode_char__charset;
extern eh_decode_buffer_t eh_decode_buffer__charset;
extern eh_encode_buffer_t eh_encode_buffer__charset;
+/*! \brief Bitmask of LH_XANY_NORMALIZATION_... bits; the x-any language handler normalizes combined characters in different ways, depending on this variable */
+extern int lh_xany_normalization_flags;
+#define LH_XANY_NORMALIZATION_COMBINE 0x1 /*!< Any pair of base char and combining char (NSM, non-spacing modifier) is replaced with a single combined char */
+#define LH_XANY_NORMALIZATION_TOBASE 0x2 /*!< Any combined char is converted to its (smallest known) base. If bit LH_XANY_NORMALIZATION_COMBINE is also set, a pair of base char and combining char loses its second char */
+#define LH_XANY_NORMALIZATION_FULL 0xFF /*!< All flag bits, including ones not assigned yet; more flags may appear in the future */
/*! \brief Language handler for "x-any" language, used for unknown/unspecified languages */
extern lang_handler_t lh__xany;
/*! \brief Language handler for "x-ftq-x-any" language, used as free-text-query language for unknown/unspecified languages */
extern lang_handler_t lh__xftqxany;
/* No real need in accelerated handlers of "UCS-4BE" or "UCS-4LE" encoded text on "x-any" language */
/*! \brief Handler of "UTF-16BE" encoded text on "x-any" language */
extern encodedlang_handler_t elh__xany__UTF16BE;
/*! \brief Handler of "UTF-16LE" encoded text on "x-any" language */
Index: libsrc/langfunc/langfunc_templ.c
===================================================================
RCS file: /home/staff/us-cvs/virtuoso/libsrc/langfunc/langfunc_templ.c,v
retrieving revision 1.5
diff -u -U 10 -r1.5 langfunc_templ.c
--- libsrc/langfunc/langfunc_templ.c 13 Jul 2006 07:59:06 -0000 1.5
+++ libsrc/langfunc/langfunc_templ.c 14 Jan 2011 12:35:36 -0000
@@ -61,79 +61,98 @@
void LH_ITERATE_WORDS_NAME(const unichar *buf, size_t bufsize, lh_word_check_t *check, lh_word_callback_t *callback, void *userdata)
{
size_t pos = 0;
size_t word_start;
size_t word_length;
utf8char word_buf[BUFSIZEOF__UTF8_WORD];
utf8char *hugeword_buf = NULL;
size_t hugeword_buf_size = 0;
utf8char *word_end;
int prop;
+#ifdef LH_ITERATOR_DEBUG
+ int wordctr = 0, wordcount = LH_COUNT_WORDS_NAME (buf, bufsize, check);
+#define wordctr_INC1 wordctr++
+#else
+#define wordctr_INC1
+#endif
while (pos < bufsize)
{
prop = UNICHAR_GETPROPS_EXPN (buf, bufsize, pos);
if (prop & UCP_ALPHA)
{
word_start = pos;
do pos++; while ((pos < bufsize) && (UNICHAR_GETPROPS_EXPN (buf, bufsize, pos) & UCP_ALPHA));
word_length = pos - word_start;
if (WORD_MAX_CHARS < word_length)
continue;
if (NULL!=check && 0 == check(buf+word_start, word_length))
continue;
word_end = (utf8char *)eh_encode_buffer__UTF8 (buf+word_start, buf+pos, (char *)word_buf, (char *)(word_buf+BUFSIZEOF__UTF8_WORD));
if (NULL != word_end)
{
callback (word_buf, word_end-word_buf, userdata);
+ wordctr_INC1;
continue;
}
if (hugeword_buf_size<(word_length*MAX_UTF8_CHAR))
{
if (hugeword_buf_size)
dk_free (hugeword_buf, hugeword_buf_size);
hugeword_buf_size = word_length*MAX_UTF8_CHAR;
hugeword_buf = (utf8char *) dk_alloc (hugeword_buf_size);
}
word_end = (utf8char *)eh_encode_buffer__UTF8 (buf+word_start, buf+pos, (char *)hugeword_buf, (char *)(hugeword_buf+hugeword_buf_size));
callback (hugeword_buf, word_end-hugeword_buf, userdata);
+ wordctr_INC1;
continue;
}
if (prop & UCP_IDEO)
{
word_start = pos;
pos++;
if (NULL!=check && 0 == check(buf+pos-1, 1))
continue;
word_end = (utf8char *)eh_encode_buffer__UTF8 (buf+word_start, buf+pos, (char *)(word_buf), (char *)(word_buf+BUFSIZEOF__UTF8_WORD));
callback (word_buf, word_end-word_buf, userdata);
+ wordctr_INC1;
continue;
}
pos++;
}
if (hugeword_buf_size)
dk_free (hugeword_buf, hugeword_buf_size);
+#ifdef LH_ITERATOR_DEBUG
+ if (wordctr != wordcount)
+ GPF_T;
+#endif
}
void LH_ITERATE_PATCHED_WORDS_NAME(const unichar *buf, size_t bufsize, lh_word_check_t *check, lh_word_patch_t *patch, lh_word_callback_t *callback, void *userdata)
{
size_t pos = 0;
size_t word_start;
size_t word_length;
unichar patch_buf[WORD_MAX_CHARS];
const unichar *arg_begin;
size_t arg_length;
utf8char word_buf[BUFSIZEOF__UTF8_WORD];
utf8char *hugeword_buf = NULL;
size_t hugeword_buf_size = 0;
utf8char *word_end;
int prop;
+#ifdef LH_ITERATOR_DEBUG
+ int wordctr = 0, wordcount = LH_COUNT_WORDS_NAME (buf, bufsize, check);
+#define wordctr_INC1 wordctr++
+#else
+#define wordctr_INC1
+#endif
while (pos < bufsize)
{
prop = UNICHAR_GETPROPS_EXPN(buf,bufsize,pos);
if (prop & UCP_ALPHA)
{
word_start = pos;
do pos++; while ((pos < bufsize) && (UNICHAR_GETPROPS_EXPN(buf,bufsize,pos) & UCP_ALPHA));
word_length = pos - word_start;
if (WORD_MAX_CHARS < word_length)
continue;
@@ -153,31 +172,33 @@
}
else
{ /* argument should be taken right from \c buf */
arg_begin = buf+word_start;
arg_length = word_length;
}
word_end = (utf8char *)eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, (char *)(word_buf), (char *)(word_buf+BUFSIZEOF__UTF8_WORD));
if (NULL != word_end)
{
callback (word_buf, word_end-word_buf, userdata);
+ wordctr_INC1;
continue;
}
if (hugeword_buf_size<(word_length*MAX_UTF8_CHAR))
{ /* overflow danger detected */
if (hugeword_buf_size)
dk_free (hugeword_buf, hugeword_buf_size);
hugeword_buf_size = word_length*MAX_UTF8_CHAR;
hugeword_buf = (utf8char *) dk_alloc (hugeword_buf_size);
}
word_end = (utf8char *)eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, (char *)(hugeword_buf), (char *)(hugeword_buf+hugeword_buf_size));
callback (hugeword_buf, word_end-hugeword_buf, userdata);
+ wordctr_INC1;
continue;
}
if (prop & UCP_IDEO)
{
word_start = pos;
pos++;
word_length = pos - word_start;
if (NULL!=check && 0 == check(buf+word_start, word_length))
{
DBG_PRINTF_NOISE_IDEO(word_start,word_length);
@@ -192,18 +213,23 @@
}
arg_begin = patch_buf;
}
else
{ /* argument should be taken right from \c buf */
arg_begin = buf+word_start;
arg_length = word_length;
}
word_end = (utf8char *)eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, (char *)(word_buf), (char *)(word_buf+BUFSIZEOF__UTF8_WORD));
callback (word_buf, word_end-word_buf, userdata);
+ wordctr_INC1;
continue;
}
pos++;
}
if (hugeword_buf_size)
dk_free (hugeword_buf, hugeword_buf_size);
+#ifdef LH_ITERATOR_DEBUG
+ if (wordctr != wordcount)
+ GPF_T;
+#endif
}
Index: libsrc/langfunc/langman.c
===================================================================
RCS file: /home/staff/us-cvs/virtuoso/libsrc/langfunc/langman.c,v
retrieving revision 1.24
diff -u -U 10 -r1.24 langman.c
--- libsrc/langfunc/langman.c 2 Feb 2009 21:49:53 -0000 1.24
+++ libsrc/langfunc/langman.c 12 Jan 2011 20:06:08 -0000
@@ -307,34 +307,36 @@
/*! \brief Type of function registered via plugin_add_type and used by
plugin_load to invoke uv_connect of a plugin with proper appdata */
void langfunc_plugin_connect (const unit_version_t *plugin)
{
UV_CALL (plugin, uv_connect, NULL);
}
#endif
+extern void unicode3_init_char_combining_hashtables (void);
extern eh_charset_t eh_generic_chardefs[];
extern int eh_generic_chardefs_length;
extern void connect__enUS (void *appdata);
extern void connect__xViDoc (void *appdata);
extern void connect__xViAny (void *appdata);
void langfunc_kernel_init (void)
{
static int done = 0;
int ctr;
if (done)
return;
done = 1;
reset_work_uniblocks ();
+ unicode3_init_char_combining_hashtables ();
eh_load_handler (&eh__UCS4);
eh_load_handler (&eh__UCS4BE);
eh_load_handler (&eh__UCS4LE);
eh_load_handler (&eh__UTF16);
eh_load_handler (&eh__UTF16BE);
eh_load_handler (&eh__UTF16LE);
eh_load_handler (&eh__UTF8);
eh_load_handler (&eh__UTF8_QR);
eh_load_handler (&eh__ASCII);
eh_load_handler (&eh__ISO8859_1);
Index: libsrc/langfunc/unicode3.c
===================================================================
RCS file: /home/staff/us-cvs/virtuoso/libsrc/langfunc/unicode3.c,v
retrieving revision 1.4
diff -u -U 10 -r1.4 unicode3.c
--- libsrc/langfunc/unicode3.c 24 Mar 2006 13:21:47 -0000 1.4
+++ libsrc/langfunc/unicode3.c 13 Jan 2011 23:07:13 -0000
@@ -14,20 +14,21 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*
*/
#include "langfunc.h"
+#include "Dkhash.h"
/* Conversion to uppercase */
struct unicode3_toupper_s
{
unsigned short int u3c_cell;
unsigned short int u3c_ucase;
};
@@ -98,21 +99,174 @@
if (uchr < curr)
right_cop = robber-1;
else
left_cop = robber+1;
robber = (left_cop+right_cop)/2;
}
return uchr;
}
-/* Conversion to lowercase */
+/* Conversion to base char */
+
+/* One row of the generated "to base char" table. */
+struct unicode3_tobasechar_s
+{
+  unsigned short int u3c_cell;	/* character the row describes; the table is binary-searched on this field, so rows must be in ascending order of it */
+  unsigned short int u3c_basechar;	/* base character that replaces u3c_cell */
+  unsigned short int u3c_upperbasechar;	/* starts as a copy of u3c_basechar; replaced with unicode3_getucase (u3c_basechar) by unicode3_init_char_combining_hashtables() */
+  unsigned short int u3c_modifier;	/* modifier character that, together with u3c_basechar, is equivalent to u3c_cell; 0 if the row has no modifier */
+};
+
+typedef struct unicode3_tobasechar_s unicode3_tobasechar_t;
+
+/* The generated header is a sequence of UNICODE3_REC() invocations wrapped in
+ * UNICODE3_HEADER / UNICODE3_FOOTER; the macros below turn it into the
+ * initializer of unicode3_tobasechars[].
+ *  - UNICODE3_REC keeps only the cell and the replacement field (repl1).
+ *  - repl1 is expected to be a UNICODE3_S1() or UNICODE3_S2() invocation
+ *    (TODO confirm against the generated header); each expands to the three
+ *    remaining fields: base char, upper base char placeholder, modifier.
+ *    The `mode' argument is ignored here.
+ *  - UNICODE3_FOOTER appends a sentinel row whose u3c_cell is 0xFFFF (~0
+ *    truncated to unsigned short) and whose other fields are zero.
+ * The table is not const because u3c_upperbasechar is filled in at startup. */
+#define UNICODE3_HEADER unicode3_tobasechar_t unicode3_tobasechars[] = {
+#define UNICODE3_FOOTER {~0} };
+#define UNICODE3_S2(mode,base,modif) base, base, modif
+#define UNICODE3_S1(mode,base) base, base, 0
+#define UNICODE3_REC(cell,grp1,idx1,grp2,digit1,digit2,digit3,ucase1,lcase,ucase2,flg1,repl1,name1,name2,unicodename) \
+{cell,repl1},
+#include "unicode3_basechars.h"
+#undef UNICODE3_HEADER
+#undef UNICODE3_FOOTER
+#undef UNICODE3_REC
+/* The S1/S2 helpers are local to this table as well; do not leak them into
+ * the tables that follow. */
+#undef UNICODE3_S1
+#undef UNICODE3_S2
+
+
+/* Binary search of unicode3_tobasechars[] on u3c_cell.
+ * Returns the row describing uchr, or NULL if the table has no such row.
+ * The trailing {~0} sentinel row is excluded from the search range (hence
+ * "- 2"): its u3c_cell is 0xFFFF and its other fields are zero, so including
+ * it would make a lookup of 0xFFFF yield character 0. */
+static unicode3_tobasechar_t *
+unicode3_find_tobasechar (unichar uchr)
+{
+  int left_cop = 0;
+  int right_cop = (int)(sizeof(unicode3_tobasechars)/sizeof(unicode3_tobasechars[0])) - 2;
+  /* First probe is row 251 instead of the middle: per the original author,
+   * the search is tuned for the most important case, 252 different modified
+   * Latin chars. */
+  int robber = (252-1);
+  /* Never probe outside the table if it is ever generated shorter than that. */
+  if (robber > right_cop)
+    robber = (left_cop+right_cop)/2;
+  while (left_cop <= right_cop)
+    {
+      unichar curr = unicode3_tobasechars[robber].u3c_cell;
+      if (uchr == curr)
+        return unicode3_tobasechars + robber;
+      if (uchr < curr)
+        right_cop = robber-1;
+      else
+        left_cop = robber+1;
+      robber = (left_cop+right_cop)/2;
+    }
+  return NULL;
+}
+
+/* Returns the base character recorded for uchr in unicode3_tobasechars[],
+ * or uchr itself if the table has no row for it. */
+unichar unicode3_getbasechar (unichar uchr)
+{
+  unicode3_tobasechar_t *rec = unicode3_find_tobasechar (uchr);
+  if (NULL != rec)
+    return rec->u3c_basechar;
+  return uchr;
+}
+
+/* Returns the u3c_upperbasechar recorded for uchr, or unicode3_getucase (uchr)
+ * if the table has no row for it.
+ * u3c_upperbasechar holds the uppercased base char only after
+ * unicode3_init_char_combining_hashtables() has run; before that it is a
+ * plain copy of the base char. */
+unichar unicode3_getupperbasechar (unichar uchr)
+{
+  unicode3_tobasechar_t *rec = unicode3_find_tobasechar (uchr);
+  if (NULL != rec)
+    return rec->u3c_upperbasechar;
+  return unicode3_getucase (uchr);
+}
+
+/* Conversion from a base char and modif to a single combined char.
+ * The hashtables and ranges below are filled by
+ * unicode3_init_char_combining_hashtables() from unicode3_tobasechars[]. */
+
+dk_hash_t *unicode3_modif_usages = NULL; /* modifier char -> packed ((smallest base char << 16) | largest base char) over all rows using that modifier */
+dk_hash_t *unicode3_charmodif_to_combined = NULL; /* ((base char << 16) | modifier) -> u3c_cell of the row, i.e. the combined char */
+dk_hash_t *unicode3_charmodif_to_combined_upper = NULL; /* same key -> unicode3_getucase() of the combined char */
+unichar unicode3_min_used_modif_char = 0xFFFF, unicode3_max_used_modif_char = 0; /* smallest / largest nonzero u3c_modifier seen; initial values form an empty range */
+unichar unicode3_min_exact_clone_char = 0xFFFF, unicode3_max_exact_clone_char = 0; /* smallest / largest u3c_cell among rows whose u3c_modifier is 0 */
+
+/* Returns the single combined character that the table records as equivalent
+ * to `base' followed by modifier `modif', or 0 if no such pair is recorded.
+ * The hashtables must have been filled by
+ * unicode3_init_char_combining_hashtables() before the first call.
+ *
+ * unicode3_modif_usages gives, per modifier, the packed range
+ * (min base << 16) | max base of base chars it is used with; it serves as a
+ * cheap pre-filter before the pair lookup. */
+unichar unicode3_combine_base_and_modif (unichar base, unichar modif)
+{
+  /* Hash values are integers stored in pointer-sized slots; convert them
+   * back explicitly instead of relying on an implicit pointer-to-integer
+   * conversion. */
+  uptrlong boundaries = (uptrlong) gethash ((void *)((ptrlong)modif), unicode3_modif_usages);
+  uptrlong mix;
+  if (!boundaries)
+    return 0;			/* modifier is not used by any row */
+  if ((base < (boundaries >> 16)) || (base > (boundaries & 0xFFFF)))
+    return 0;			/* base is outside the range this modifier is used with */
+  /* The key must be computed exactly as in
+   * unicode3_init_char_combining_hashtables(), otherwise lookups miss.
+   * NOTE(review): the shift is done in int, so a base >= 0x8000 would shift
+   * into the sign bit; left unchanged to stay key-compatible with the
+   * filling code. */
+  mix = (base << 16) | modif;
+  return (unichar) ((uptrlong) gethash ((void *)((ptrlong)mix), unicode3_charmodif_to_combined));
+}
+
+/* Same lookup as unicode3_combine_base_and_modif(), but returns the
+ * uppercase variant of the combined character (as stored in
+ * unicode3_charmodif_to_combined_upper), or 0 if the pair of `base' and
+ * `modif' is not recorded.
+ * The hashtables must have been filled by
+ * unicode3_init_char_combining_hashtables() before the first call. */
+unichar unicode3_combine_base_and_modif_upper (unichar base, unichar modif)
+{
+  /* Hash values are integers stored in pointer-sized slots; convert them
+   * back explicitly instead of relying on an implicit pointer-to-integer
+   * conversion. */
+  uptrlong boundaries = (uptrlong) gethash ((void *)((ptrlong)modif), unicode3_modif_usages);
+  uptrlong mix;
+  if (!boundaries)
+    return 0;			/* modifier is not used by any row */
+  /* boundaries is (min base << 16) | max base for this modifier. */
+  if ((base < (boundaries >> 16)) || (base > (boundaries & 0xFFFF)))
+    return 0;
+  /* The key must be computed exactly as in
+   * unicode3_init_char_combining_hashtables(), otherwise lookups miss.
+   * NOTE(review): the shift is done in int, so a base >= 0x8000 would shift
+   * into the sign bit; left unchanged to stay key-compatible with the
+   * filling code. */
+  mix = (base << 16) | modif;
+  return (unichar) ((uptrlong) gethash ((void *)((ptrlong)mix), unicode3_charmodif_to_combined_upper));
+}
+
+/* Builds everything the combining lookups need from unicode3_tobasechars[]:
+ *  - fills u3c_upperbasechar of every row;
+ *  - records the range of cells that have no modifier ("exact clones") and
+ *    the range of modifier chars in use;
+ *  - fills unicode3_charmodif_to_combined and
+ *    unicode3_charmodif_to_combined_upper, keyed by (base << 16) | modifier;
+ *  - fills unicode3_modif_usages with, per modifier, the packed range
+ *    (min base << 16) | max base of base chars it occurs with.
+ * Only the first call does any work; later calls return immediately. */
+void unicode3_init_char_combining_hashtables (void)
+{
+  int cellctr;
+  /* Number of real rows; the trailing {~0} sentinel row is not processed. */
+  int cellcount = (int)(sizeof(unicode3_tobasechars)/sizeof(unicode3_tobasechars[0])) - 1;
+  if (NULL != unicode3_modif_usages)
+    return;
+  unicode3_modif_usages = hash_table_allocate (509);
+  unicode3_charmodif_to_combined = hash_table_allocate (1531);
+  unicode3_charmodif_to_combined_upper = hash_table_allocate (1531);
+  for (cellctr = 0; cellctr < cellcount; cellctr++)
+    {
+      unicode3_tobasechar_t *rec = unicode3_tobasechars + cellctr;
+      uptrlong cell = rec->u3c_cell;
+      unichar modif = rec->u3c_modifier;
+      uptrlong mix, old_cell_for_mix, boundaries, boundaries_min, boundaries_max;
+      rec->u3c_upperbasechar = unicode3_getucase (rec->u3c_basechar);
+      if (0 == modif)
+        {
+          /* Row without a modifier: only widen the "exact clone" range. */
+          if (cell < unicode3_min_exact_clone_char)
+            unicode3_min_exact_clone_char = cell;
+          if (cell > unicode3_max_exact_clone_char)
+            unicode3_max_exact_clone_char = cell;
+          continue;
+        }
+      if (modif < unicode3_min_used_modif_char)
+        unicode3_min_used_modif_char = modif;
+      if (modif > unicode3_max_used_modif_char)
+        unicode3_max_used_modif_char = modif;
+      /* Key shared with the lookup functions; keep the expression identical
+       * on both sides. */
+      mix = (rec->u3c_basechar << 16) | modif;
+      old_cell_for_mix = (uptrlong) gethash ((void *)mix, unicode3_charmodif_to_combined);
+      if (old_cell_for_mix)
+        {
+          /* Two rows share the same (base, modifier) pair: fatal in debug
+           * builds, otherwise the first row wins and this one is ignored. */
+#ifndef NDEBUG
+          GPF_T;
+#endif
+          continue;
+        }
+      sethash ((void *)mix, unicode3_charmodif_to_combined, (void *)cell);
+      /* Go through ptrlong so the int result is widened before it becomes a pointer. */
+      sethash ((void *)mix, unicode3_charmodif_to_combined_upper, (void *)((ptrlong)(unicode3_getucase (cell))));
+      boundaries = (uptrlong) gethash ((void *)((ptrlong)(modif)), unicode3_modif_usages);
+      if (!boundaries)
+        {
+          /* First row that uses this modifier. */
+          boundaries_min = boundaries_max = rec->u3c_basechar;
+        }
+      else
+        {
+          boundaries_min = boundaries >> 16;
+          boundaries_max = boundaries & 0xFFFF;
+          if (rec->u3c_basechar < boundaries_min)
+            boundaries_min = rec->u3c_basechar;
+          if (rec->u3c_basechar > boundaries_max)
+            boundaries_max = rec->u3c_basechar;
+        }
+      sethash ((void *)((ptrlong)(modif)), unicode3_modif_usages, ((void*)((boundaries_min << 16) | boundaries_max)));
+    }
+}
+
+/* Check for being a whitespace */
#define UNICODE3_HEADER unichar unicode3_spaces[] = {
#define UNICODE3_FOOTER ~0 };
#define UNICODE3_REC(cell,grp1,idx1,grp2,digit1,digit2,digit3,ucase1,lcase,ucase2,flg1,repl1,name1,name2,unicodename) \
cell,
#include "unicode3_spaces.h"
#undef UNICODE3_HEADER
#undef UNICODE3_FOOTER
#undef UNICODE3_REC
Index: libsrc/langfunc/unicodetab2h.sh
===================================================================
RCS file: /home/staff/us-cvs/virtuoso/libsrc/langfunc/unicodetab2h.sh,v
retrieving revision 1.7
diff -u -U 10 -r1.7 unicodetab2h.sh
--- libsrc/langfunc/unicodetab2h.sh 14 Apr 2009 11:39:26 -0000 1.7
+++ libsrc/langfunc/unicodetab2h.sh 12 Jan 2011 14:29:01 -0000
@@ -100,16 +100,20 @@
echo -n "Search for lowecase characters... "
grep ";[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$" < unicode3.dat > tmp/tmp1.c
echo " 100% done."
echo -n "Search for uppercase characters... "
grep ";[0-9A-F][0-9A-F][0-9A-F][0-9A-F];$" < unicode3.dat > tmp/tmp2.c
echo " 100% done."
echo -n "Search for 'logical space' characters... "
grep ";0;WS;" < unicode3.dat > tmp/tmp3.c
echo " 100% done."
+echo -n "Search for modified/compatible characters... "
+grep -E "^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[0-9A-F][0-9A-F][0-9A-F][0-9A-F](( [0-9A-F][0-9A-F][0-9A-F][0-9A-F])?);" < unicode3.dat > tmp/tmp4.c # keep records whose 6th ';'-separated field is exactly one or two 4-hex-digit code points (presumably the decomposition field - verify against unicode3.dat)
+echo " 100% done."
tabtoheader tmp/tmp1.c unicode3_lowers.h
tabtoheader tmp/tmp2.c unicode3_uppers.h
tabtoheader tmp/tmp3.c unicode3_spaces.h
+tabtoheader tmp/tmp4.c unicode3_basechars.h
tabtoheader unicode3.dat unicode3_all_chars.h
rm -rf tmp