UTF-8: New function unicode_fold_label_case and a related script. --- commit 9e1fcebe33cf251fea537c9a06107004d9ba729b tree ebff9dc7b5806a2e741dcc8bb379c42ac35f6796 parent 49f5b0819eefd68261ea2f2aa2bbca166dd0ae4a author Kalle Olavi Niemitalo <[EMAIL PROTECTED]> Sat, 05 Aug 2006 19:45:53 +0300 committer Kalle Olavi Niemitalo <[EMAIL PROTECTED]> Sat, 05 Aug 2006 20:35:02 +0300
Unicode/gen-case | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++
src/intl/charsets.c | 24 +++++++++
src/intl/charsets.h | 1
3 files changed, 162 insertions(+), 0 deletions(-)
diff --git a/Unicode/gen-case b/Unicode/gen-case
new file mode 100755
index 0000000..e67037b
--- /dev/null
+++ b/Unicode/gen-case
@@ -0,0 +1,137 @@
+#! /usr/bin/perl
+use strict;
+use warnings;
+
+my @trans;  # $trans[CODE] = MAPPING, both as integers; undef where no mapping was read
+
+print "\t/* -*- c -*- source code generated by ", join(" ", $0, @ARGV), " */\n";
+while (<>) {  # every line of every input file named on the command line (or stdin)
+ s/#.*$//;  # drop the trailing "# ..." comment
+ next if /^\s*$/;  # nothing left: blank or comment-only line
+ my($code, $status, $mapping) = /^([[:xdigit:]]+);\s*([CFST]);\s*([[:xdigit:]]+(?:\s+[[:xdigit:]]+)*);\s*$/
+ or warn("$ARGV:$.: weird line\n"), next;  # expected "code; status; mapping;" with hex fields
+ next unless $status eq "C" or $status eq "S";  # keep statuses C and S only; F and T are skipped
+ warn("$ARGV:$.: multi-char simple mapping\n"), next
+ if $mapping =~ /\s/;  # a kept mapping must be a single code point
+ $code = hex($code);
+ $mapping = hex($mapping);
+ $trans[$code] = $mapping;
+} continue {
+ close ARGV if eof;  # restarts $. so the line numbers in warnings are per input file
+}
+
+sub gobble {  # try to print one C "if" covering a strided range starting at $begin; returns 1 if printed, else 0
+ my($begin, $step) = @_;  # $begin: code whose $trans entry is defined; $step: stride between covered codes
+ my $diff = $trans[$begin] - $begin;  # offset that every covered code must share
+ my @codes;  # accepted codes, in ascending order
+ my @holes;  # in-stride positions inside the range that do not map by $diff
+ my $probe = $begin;
+ my $hole;  # candidate hole; only recorded once another run is found after it
+ while (1) {
+ my @beyond;  # current run of in-stride codes that map by $diff
+ while (defined($trans[$probe]) && $trans[$probe] == $probe + $diff) {
+ push @beyond, $probe;
+ $probe += $step;
+ }
+ last unless @beyond >= 2;  # a run shorter than 2 ends the range (and rejects it if it was the first run)
+ push @holes, $hole if defined $hole;
+ push @codes, @beyond;
+ $hole = $probe;  # the position that broke the run
+ $probe += $step;  # step over it and look for a further run
+ }
+ return 0 unless @codes;
+
+ # The following formula was tuned for i486-linux-gnu-gcc-4.0 -O1.
+ if (@codes <= 2 + @holes) {  # few codes relative to holes: test each code individually
+ print "if (", join(" || ", map { sprintf("c == 0x%X", $_) } @codes), ")\n";
+ } else {  # otherwise: bounds test, minus the holes, plus a stride test when $step > 1
+ printf "if (c >= 0x%X && c <= 0x%X", $codes[0], $codes[-1];
+ printf " && c != 0x%X", $_ foreach @holes;
+ if ($step == 2) { printf " && (c & 1) == %d", $begin & 1 }
+ elsif ($step != 1) { printf " && c %% %d == %d", $step, $begin % $step }
+ print ")\n";
+ }
+ if ($diff != 0) {  # with a zero offset only the ";" below is printed, i.e. an empty statement
+ if ($diff < 0) { printf "\t\tc -= "; $diff = -$diff }
+ else { printf "\t\tc += " }
+ if ($diff < 10) { printf "%d", $diff }
+ else { printf "0x%X", $diff }
+ }
+ print ";\n";
+
+ undef $trans[$_] foreach @codes;  # forget the covered codes so the caller's scan skips them
+ return 1;
+}
+
+my $first = 1;  # true until the first statement is printed; later ones are prefixed with "else"
+for (my $code = 0; $code <= $#trans; ++$code) {
+ next unless defined $trans[$code];  # no mapping here, or already covered by an earlier gobble
+
+ print $first ? "\t" : "\telse ";
+ gobble($code, 1) or gobble($code, 2) or gobble($code, 3) or gobble($code, 4)  # strides 1..4, first success wins
+ or printf "if (c == 0x%X)\n\t\tc = 0x%X;\n", $code, $trans[$code];  # fallback: test this single code
+ $first = 0;
+}
+close STDOUT or die "$0: -: $!\n";  # report a failure to finish writing the output
+
+__END__
+
+=head1 NAME
+
+gen-case - Generate C source code for folding the case of a Unicode character.
+
+=head1 SYNOPSIS
+
+B<gen-case> CaseFolding.txt > ../src/intl/casefold.inc
+
+=head1 DESCRIPTION
+
+B<gen-case> reads F<CaseFolding.txt> of the Unicode Character Database
+and generates C source code that implements the I<simple case folding>
+as defined in that file.
+
+The generated source code can then be used like this:
+
+ unicode_val_T
+ unicode_simple_case_fold(unicode_val_T c)
+ {
+ #include "casefold.inc"
+ return c;
+ }
+
+=head1 BUGS
+
+Supports neither B<--help> nor B<--version>.
+
+=head1 AUTHOR
+
+Kalle Olavi Niemitalo <[EMAIL PROTECTED]>
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (c) 2006 Kalle Olavi Niemitalo.
+
+This program is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself. In addition:
+
+=over
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+=back
diff --git a/src/intl/charsets.c b/src/intl/charsets.c
index 31905ba..dc8d25e 100644
--- a/src/intl/charsets.c
+++ b/src/intl/charsets.c
@@ -10,6 +10,9 @@ #endif
#include <ctype.h>
#include <stdlib.h>
+#if HAVE_WCTYPE_H
+#include <wctype.h>
+#endif
#include "elinks.h"
@@ -404,6 +407,27 @@ unicode_to_cell(unicode_val_T c)
return 1;
}
+/* Fold the case of a Unicode character, so that hotkeys in labels can
+ * be compared case-insensitively. This should be called only if
+ * check_kbd_label_key(c) is true. It is unspecified whether the
+ * result will be in upper or lower case. */
+unicode_val_T
+unicode_fold_label_case(unicode_val_T c)
+{
+#if __STDC_ISO_10646__ && HAVE_WCTYPE_H
+ return towlower(c); /* NOTE(review): presumably relies on wchar_t values being ISO 10646 code points; towlower also follows the current locale -- confirm that is acceptable here. */
+#else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
+ /* For now, this supports only ASCII. It would be possible to
+ * use code generated from CaseFolding.txt of Unicode if the
+ * acknowledgements required by http://www.unicode.org/copyright.html
+ * were added to associated documentation of ELinks. */
+ if (c >= 0x41 && c <= 0x5A) /* ASCII 'A'..'Z' */
+ return c + 0x20; /* the matching 'a'..'z' */
+ else
+ return c;
+#endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
+}
+
inline unicode_val_T
utf_8_to_unicode(unsigned char **string, unsigned char *end)
{
diff --git a/src/intl/charsets.h b/src/intl/charsets.h
index 8d11707..ae8fe97 100644
--- a/src/intl/charsets.h
+++ b/src/intl/charsets.h
@@ -62,6 +62,7 @@ int utf8_ptr2cells(unsigned char *, unsi
int utf8_ptr2chars(unsigned char *, unsigned char *);
int utf8_cells2bytes(unsigned char *, int, unsigned char *);
inline int unicode_to_cell(unicode_val_T);
+unicode_val_T unicode_fold_label_case(unicode_val_T);
inline int strlen_utf8(unsigned char **);
inline unicode_val_T utf_8_to_unicode(unsigned char **, unsigned char *);
unicode_val_T cp2u(int, unsigned char);
pgpBodM7QLw4R.pgp
Description: PGP signature
_______________________________________________ elinks-dev mailing list [email protected] http://linuxfromscratch.org/mailman/listinfo/elinks-dev
