UTF-8: New function unicode_fold_label_case and a related script.

---
commit 9e1fcebe33cf251fea537c9a06107004d9ba729b
tree ebff9dc7b5806a2e741dcc8bb379c42ac35f6796
parent 49f5b0819eefd68261ea2f2aa2bbca166dd0ae4a
author Kalle Olavi Niemitalo <[EMAIL PROTECTED]> Sat, 05 Aug 2006 19:45:53 +0300
committer Kalle Olavi Niemitalo <[EMAIL PROTECTED]> Sat, 05 Aug 2006 20:35:02 +0300

 Unicode/gen-case    |  137 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/intl/charsets.c |   24 +++++++++
 src/intl/charsets.h |    1 
 3 files changed, 162 insertions(+), 0 deletions(-)

diff --git a/Unicode/gen-case b/Unicode/gen-case
new file mode 100755
index 0000000..e67037b
--- /dev/null
+++ b/Unicode/gen-case
@@ -0,0 +1,137 @@
+#! /usr/bin/perl
+use strict;
+use warnings;
+
+my @trans;  # $trans[code point] = code point it folds to (single-character mappings only)
+
+print "\t/* -*- c -*- source code generated by ", join(" ", $0, @ARGV), " */\n";
+while (<>) {  # read every input file named on the command line (or stdin)
+    s/#.*$//;  # drop the trailing comment of each data line
+    next if /^\s*$/;  # nothing left but whitespace: skip
+    my($code, $status, $mapping) = /^([[:xdigit:]]+);\s*([CFST]);\s*([[:xdigit:]]+(?:\s+[[:xdigit:]]+)*);\s*$/
+	or warn("$ARGV:$.: weird line\n"), next;  # line does not parse: report it and move on
+    next unless $status eq "C" or $status eq "S";  # only C and S entries are used; F and T are ignored
+    warn("$ARGV:$.: multi-char simple mapping\n"), next
+	if $mapping =~ /\s/;  # whitespace means several code points, which @trans cannot hold
+    $code = hex($code);
+    $mapping = hex($mapping);
+    $trans[$code] = $mapping;
+} continue {
+    close ARGV if eof;  # closing ARGV at each file's end resets $. so warnings cite per-file line numbers
+}
+
+sub gobble {  # Try to print one C "if" statement covering a run of code points that share one offset.
+    my($begin, $step) = @_;  # $begin: first code point (caller ensures $trans[$begin] is defined); $step: spacing of probes
+    my $diff = $trans[$begin] - $begin;  # offset that every member of the run must share
+    my @codes;  # code points accepted into the run
+    my @holes;  # non-matching positions lying between accepted stretches
+    my $probe = $begin;
+    my $hole;  # candidate hole; recorded only once another stretch follows it
+    while (1) {
+	my @beyond;  # matching code points found in the current stretch
+	while (defined($trans[$probe]) && $trans[$probe] == $probe + $diff) {
+	    push @beyond, $probe;
+	    $probe += $step;
+	}
+	last unless @beyond >= 2;  # a stretch of fewer than two matches is not accepted; stop extending
+	push @holes, $hole if defined $hole;  # the gap before this stretch is now inside the run
+	push @codes, @beyond;
+	$hole = $probe;  # first position that failed the test
+	$probe += $step;  # step over it and look for another stretch
+    }
+    return 0 unless @codes;  # nothing has been printed; the caller may try another step
+
+    # The following formula was tuned for i486-linux-gnu-gcc-4.0 -O1.
+    if (@codes <= 2 + @holes) {  # few codes relative to holes: compare against each code explicitly
+	print "if (", join(" || ", map { sprintf("c == 0x%X", $_) } @codes), ")\n";
+    } else {  # otherwise print a range test and exclude what does not belong
+	printf "if (c >= 0x%X && c <= 0x%X", $codes[0], $codes[-1];
+	printf " && c != 0x%X", $_ foreach @holes;  # carve each hole out of the range
+	if ($step == 2) { printf " && (c & 1) == %d", $begin & 1 }  # step 2: select by parity
+	elsif ($step != 1) { printf " && c %% %d == %d", $step, $begin % $step }  # other steps: select by residue
+	print ")\n";
+    }
+    if ($diff != 0) {  # with a zero offset only the ";" below is printed, giving an empty statement
+	if ($diff < 0) { printf "\t\tc -= "; $diff = -$diff }  # print the magnitude with the matching operator
+	else { printf "\t\tc += " }
+	if ($diff < 10) { printf "%d", $diff }  # small offsets in decimal
+	else { printf "0x%X", $diff }  # larger ones in hexadecimal
+    }
+    print ";\n";
+
+    undef $trans[$_] foreach @codes;  # mark as handled so the main loop skips these code points
+    return 1;  # an "if" statement was printed
+}
+
+my $first = 1;  # true until the first "if" has been printed; later ones are chained with "else"
+for (my $code = 0; $code <= $#trans; ++$code) {
+    next unless defined $trans[$code];  # no mapping, or already covered by an earlier gobble call
+
+    print $first ? "\t" : "\telse ";
+    gobble($code, 1) or gobble($code, 2) or gobble($code, 3) or gobble($code, 4)  # try steps 1 to 4 in order
+	or printf "if (c == 0x%X)\n\t\tc = 0x%X;\n", $code, $trans[$code];  # fallback: test this single code point
+    $first = 0;
+}
+close STDOUT or die "$0: -: $!\n";  # closing explicitly lets a failed write be reported
+
+__END__
+
+=head1 NAME
+
+gen-case - Generate C source code for folding the case of a Unicode character.
+
+=head1 SYNOPSIS
+
+B<gen-case> CaseFolding.txt > ../src/intl/casefold.inc
+
+=head1 DESCRIPTION
+
+B<gen-case> reads F<CaseFolding.txt> of the Unicode Character Database
+and generates C source code that implements the I<simple case folding>
+as defined in that file.
+
+The generated source code can then be used like this:
+
+  unicode_val_T
+  unicode_simple_case_fold(unicode_val_T c)
+  {
+  #include "casefold.inc"
+          return c;
+  }
+
+=head1 BUGS
+
+Does not support B<--help> or B<--version>.
+
+=head1 AUTHOR
+
+Kalle Olavi Niemitalo <[EMAIL PROTECTED]>
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (c) 2006 Kalle Olavi Niemitalo.
+
+This program is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself.  In addition:
+
+=over
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+=back
diff --git a/src/intl/charsets.c b/src/intl/charsets.c
index 31905ba..dc8d25e 100644
--- a/src/intl/charsets.c
+++ b/src/intl/charsets.c
@@ -10,6 +10,9 @@ #endif
 
 #include <ctype.h>
 #include <stdlib.h>
+#if HAVE_WCTYPE_H
+#include <wctype.h>
+#endif
 
 #include "elinks.h"
 
@@ -404,6 +407,27 @@ unicode_to_cell(unicode_val_T c)
 	return 1;
 }
 
+/* Fold the case of a Unicode character, so that hotkeys in labels can
+ * be compared case-insensitively.  This should be called only if
+ * check_kbd_label_key(c) is true.  It is unspecified whether the
+ * result will be in upper or lower case.  */
+unicode_val_T
+unicode_fold_label_case(unicode_val_T c)
+{
+#if __STDC_ISO_10646__ && HAVE_WCTYPE_H
+	return towlower(c); /* NOTE(review): towlower() follows the current locale - confirm that is acceptable here */
+#else  /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
+	/* For now, this supports only ASCII.  It would be possible to
+	 * use code generated from CaseFolding.txt of Unicode if the
+	 * acknowledgements required by http://www.unicode.org/copyright.html
+	 * were added to associated documentation of ELinks.  */
+	if (c >= 0x41 && c <= 0x5A) /* ASCII 'A' through 'Z' */
+		return c + 0x20; /* the corresponding 'a' through 'z' */
+	else
+		return c; /* everything else is returned unchanged */
+#endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
+}
+
 inline unicode_val_T
 utf_8_to_unicode(unsigned char **string, unsigned char *end)
 {
diff --git a/src/intl/charsets.h b/src/intl/charsets.h
index 8d11707..ae8fe97 100644
--- a/src/intl/charsets.h
+++ b/src/intl/charsets.h
@@ -62,6 +62,7 @@ int utf8_ptr2cells(unsigned char *, unsi
 int utf8_ptr2chars(unsigned char *, unsigned char *);
 int utf8_cells2bytes(unsigned char *, int, unsigned char *);
 inline int unicode_to_cell(unicode_val_T);
+unicode_val_T unicode_fold_label_case(unicode_val_T);
 inline int strlen_utf8(unsigned char **);
 inline unicode_val_T utf_8_to_unicode(unsigned char **, unsigned char *);
 unicode_val_T cp2u(int, unsigned char);

Attachment: pgpBodM7QLw4R.pgp
Description: PGP signature

_______________________________________________
elinks-dev mailing list
[email protected]
http://linuxfromscratch.org/mailman/listinfo/elinks-dev

Reply via email to