Hi, I tried to work out how UTF-8 locales can work in sword. I noticed that there are two uppercasing functions in sword: toupperstr(), which only works on Latin-1, and toupperstr_utf8(), which can use ICU. I therefore created a patch to handle UTF-8 consistently in sword by removing (commenting out) toupperstr() and patching toupperstr_utf8() a little. Please look it over and see if this is OK.
It should work; the only problem is that ICU's toUpper() does not work correctly on my system right now, and I am not sure why: it leaves the string unchanged. Chris, can you help me here? Does it work for you? Thanks for any feedback. Martin
Index: debian/rules =================================================================== RCS file: /cvs/core/sword/debian/rules,v retrieving revision 1.7 diff -u -3 -p -u -r1.7 rules --- debian/rules 17 Jan 2004 21:21:13 -0000 1.7 +++ debian/rules 18 Jan 2004 15:26:04 -0000 @@ -34,7 +34,7 @@ configure-stamp: chmod 755 configure ./configure $(confflags) --prefix=/usr --mandir=\$${prefix}/share/man \ --infodir=\$${prefix}/share/info --with-zlib \ - --sysconfdir=/etc --enable-shared --without-icu \ + --sysconfdir=/etc --enable-shared --with-icu \ --without-lucene touch configure.stamp Index: include/utilstr.h =================================================================== RCS file: /cvs/core/sword/include/utilstr.h,v retrieving revision 1.11 diff -u -3 -p -u -r1.11 utilstr.h --- include/utilstr.h 22 Jun 2003 23:50:23 -0000 1.11 +++ include/utilstr.h 18 Jan 2004 15:26:04 -0000 @@ -33,7 +33,7 @@ char *strstrip (char *istr); const char *stristr (const char *s1, const char *s2); const char strnicmp(const char *s1, const char *s2, int len); unsigned int strlenw(const char *s1); -char *toupperstr(char *buf); +//char *toupperstr(char *buf); char *toupperstr_utf8(char *buf, unsigned int max = 0); /* Index: src/keys/versekey.cpp =================================================================== RCS file: /cvs/core/sword/src/keys/versekey.cpp,v retrieving revision 1.58 diff -u -3 -p -u -r1.58 versekey.cpp --- src/keys/versekey.cpp 27 Jun 2003 01:41:07 -0000 1.58 +++ src/keys/versekey.cpp 18 Jan 2004 15:26:05 -0000 @@ -324,7 +324,7 @@ int VerseKey::getBookAbbrev(const char * stdstr(&abbr, iabbr); strstrip(abbr); if (!i) - toupperstr(abbr); + toupperstr_utf8(abbr); abLen = strlen(abbr); if (abLen) { Index: src/modules/filters/swbasicfilter.cpp =================================================================== RCS file: /cvs/core/sword/src/modules/filters/swbasicfilter.cpp,v retrieving revision 1.33 diff -u -3 -p -u -r1.33 swbasicfilter.cpp --- 
src/modules/filters/swbasicfilter.cpp 24 Oct 2003 02:43:46 -0000 1.33 +++ src/modules/filters/swbasicfilter.cpp 18 Jan 2004 15:26:05 -0000 @@ -93,7 +93,7 @@ void SWBasicFilter::addTokenSubstitute(c if (!tokenCaseSensitive) { stdstr(&buf, findString); - toupperstr(buf); + toupperstr_utf8(buf); tokenSubMap[buf] = replaceString; delete [] buf; } @@ -114,7 +114,7 @@ void SWBasicFilter::addEscapeStringSubst if (!escStringCaseSensitive) { stdstr(&buf, findString); - toupperstr(buf); + toupperstr_utf8(buf); escSubMap.insert(DualStringMap::value_type(buf, replaceString)); delete [] buf; } @@ -135,7 +135,7 @@ bool SWBasicFilter::substituteToken(SWBu if (!tokenCaseSensitive) { char *tmp = 0; stdstr(&tmp, token); - toupperstr(tmp); + toupperstr_utf8(tmp); it = tokenSubMap.find(tmp); delete [] tmp; } else @@ -155,7 +155,7 @@ bool SWBasicFilter::substituteEscapeStri if (!escStringCaseSensitive) { char *tmp = 0; stdstr(&tmp, escString); - toupperstr(tmp); + toupperstr_utf8(tmp); it = escSubMap.find(tmp); delete [] tmp; } else Index: src/modules/texts/rawtext/rawtext.cpp =================================================================== RCS file: /cvs/core/sword/src/modules/texts/rawtext/rawtext.cpp,v retrieving revision 1.69 diff -u -3 -p -u -r1.69 rawtext.cpp --- src/modules/texts/rawtext/rawtext.cpp 17 Jan 2004 04:33:25 -0000 1.69 +++ src/modules/texts/rawtext/rawtext.cpp 18 Jan 2004 15:26:06 -0000 @@ -282,7 +282,7 @@ signed char RawText::createSearchFramewo while (word) { // make word upper case - toupperstr(word); + toupperstr_utf8(word); // lookup word in dictionary (or make entry in dictionary // for this word) and add this module position (index) to @@ -519,7 +519,7 @@ ListKey &RawText::search(const char *ist // toupper our copy of search string stdstr(&wordBuf, istr); - toupperstr(wordBuf); + toupperstr_utf8(wordBuf); // get list of individual words words = (char **)calloc(sizeof(char *), 10); Index: src/utilfuns/utilstr.cpp 
=================================================================== RCS file: /cvs/core/sword/src/utilfuns/utilstr.cpp,v retrieving revision 1.25 diff -u -3 -p -u -r1.25 utilstr.cpp --- src/utilfuns/utilstr.cpp 27 Jun 2003 02:21:05 -0000 1.25 +++ src/utilfuns/utilstr.cpp 18 Jan 2004 15:26:06 -0000 @@ -1,6 +1,7 @@ #include <utilstr.h> #include <ctype.h> #include <string.h> +#include <iostream> #ifdef _ICU_ #include <unicode/utypes.h> @@ -147,26 +148,29 @@ unsigned int strlenw(const char *s1) { } -/****************************************************************************** - * toupperstr - converts a string to uppercase string - * - * ENT: target - string to convert - * - * RET: target - */ - -char *toupperstr(char *buf) { - char *ret = buf; - - while (*buf) - *buf = SW_toupper(*buf++); - - return ret; -} +///****************************************************************************** +// * toupperstr - converts a string to uppercase string +// * +// * ENT: target - string to convert +// * +// * RET: target +// */ +// +//char *toupperstr(char *buf) { +// char *ret = buf; +// +// while (*buf) +// *buf = SW_toupper(*buf++); +// +// return ret; +//} /****************************************************************************** - * toupperstr - converts a string to uppercase string + * toupperstr_utf8 - converts a string to uppercase string + * If ICU support is enabled in sword, this function will use it to do the work. + * If ICU support is not enabled, this function will ONLY work correctly with + * Latin-1 data! * * ENT: target - string to convert * @@ -179,23 +183,26 @@ char *toupperstr_utf8(char *buf, unsigne #ifndef _ICU_ // try to decide if it's worth trying to toupper. Do we have more // characters that are probably lower latin than not? - long performOp = 0; - for (const char *ch = buf; *ch; ch++) - performOp += (*ch > 0) ? 1 : -1; - if (performOp) { +//mgruner: WHAT IS THIS CODE FOR? TOUPPER IS SUPPOSED TO ALWAYS WORK... 
+// long performOp = 0; +// for (const char *ch = buf; *ch; ch++) +// performOp += (*ch > 0) ? 1 : -1; +// +// if (performOp) { while (*buf) *buf = SW_toupper(*buf++); - } +// } #else if (!max) max = strlen(ret); - UErrorCode err = U_ZERO_ERROR; - UConverter *conv = ucnv_open("UTF-8", &err); - UnicodeString str(buf, -1, conv, err); - UnicodeString ustr = str.toUpper(); - ustr.extract(ret, max, conv, err); - ucnv_close(conv); + + UErrorCode err = U_ZERO_ERROR; + UConverter *conv = ucnv_open("UTF-8", &err); + UnicodeString str(buf, -1, conv, err); + UnicodeString ustr = str.toUpper(); + ustr.extract(ret, max, conv, err); + ucnv_close(conv); #endif return ret;