Package: locales Version: 2.35-1 Severity: normal X-Debbugs-Cc: t...@mirbsd.de
While adjusting my localedata patch script to the latest glibc uploads I discovered a surprising difference in some categories — for example: (sid-amd64)tglase@tglase:~ $ LC_ALL=C ./tstspc U+0009 U+000A U+000B U+000C U+000D U+0020 (sid-amd64)tglase@tglase:~ $ LC_ALL=C.UTF-8 ./tstspc U+0009 U+000A U+000B U+000C U+000D U+0020 U+1680 U+2000 U+2001 U+2002 U+2003 U+2004 U+2005 U+2006 U+2008 U+2009 U+200A U+2028 U+2029 U+205F U+3000 The test program is thus: gcc -O2 -Wall -Wextra -Wformat -o tstspc tstspc.c //--------------------------------cut-here------------------------------ #include <err.h> #include <stdio.h> #include <locale.h> #include <wctype.h> int main(void) { wint_t wc; if (!setlocale(LC_ALL, "")) err(1, "setlocale"); #define DOIT(lim, fmtstr) do { \ while (wc <= lim) { \ if (iswspace(wc)) \ printf(fmtstr, (unsigned int)wc); \ ++wc; \ } \ } while (/* CONSTCOND */ 0) wc = 0; DOIT(0xFFFF, "U+%04X\n"); DOIT(0x10FFFF, "U-%08X\n"); return (0); } //--------------------------------cut-here------------------------------ In my localedata patch script, I take specific care to change the copy of i18n_ctype before applying it to C.UTF-8 as follows: space → <U0009>..<U000D>;<U0020> cntrl → <U0000>..<U001F>;<U007F> blank → <U0009>;<U0020> They are as mandated by POSIX for the C locale. I believe I said in my original 2013 proposal for a C.UTF-8 locale that it should be as close to C as possible while using UTF-8 as encoding. For those curious, I’m attaching the current WIP of said script. 
-- System Information: Debian Release: bookworm/sid APT prefers unstable-debug APT policy: (500, 'unstable-debug'), (500, 'buildd-unstable'), (500, 'unstable'), (1, 'experimental') merged-usr: no Architecture: amd64 (x86_64) Kernel: Linux 5.10.0-10-amd64 (SMP w/4 CPU threads) Kernel taint flags: TAINT_FIRMWARE_WORKAROUND Locale: LANG=C, LC_CTYPE=C.UTF-8 (charmap=UTF-8), LANGUAGE not set Shell: /bin/sh linked to /bin/lksh Init: sysvinit (via /sbin/init) Versions of packages locales depends on: ii debconf [debconf-2.0] 1.5.79 ii libc-bin 2.35-1 ii libc-l10n 2.35-1 locales recommends no packages. locales suggests no packages. -- debconf information: locales/default_environment_locale: None locales/locales_to_be_generated:
# -*- mode: sh -*- #- # Copyright © 2017, 2020, 2022 # mirabilos <m...@mirbsd.org> # # Provided that these terms and disclaimer and all copyright notices # are retained or reproduced in an accompanying document, permission # is granted to deal in this work without restriction, including un‐ # limited rights to use, publicly perform, distribute, sell, modify, # merge, give away, or sublicence. # # This work is provided “AS IS” and WITHOUT WARRANTY of any kind, to # the utmost extent permitted by applicable law, neither express nor # implied; without malicious intent or gross negligence. In no event # may a licensor, author or contributor be held liable for indirect, # direct, other damage, loss, or other issues arising in any way out # of dealing in the work, even if advised of the possibility of such # damage or existence of a defect, except proven that it results out # of said person’s immediate fault when using the work as intended. #- # Installs UTF-8 charmap shipped by mirabilos-support generated from # MirBSD but glibc-compatible into the glibc locale source directory, # to update wcwidth data (search for “Character width according to”) # and the i18n_ctype data taken off glibc git master to enable those # glyphs added in later UCS versions at all, to (find “It covers”) — # not always the same version the charmap ships, but close enough at # least — patching the C.UTF-8 and Turkish localedata appropriately, # handling i18n for stretch glibc and older as well. This also fixes # Turkish title-case i→İ and C.UTF-8 supporting UCD 9 only, not just # making ䷀ fullwidth again (bad width tablegens recently sprouted). 
case x$KSH_VERSION in x'@(#)MIRBSD KSH R'[5-9][0-9]*|x'@(#)MIRBSD KSH R'[1-9][0-9][0-9]*) ;; *) echo >&2 "E: need mksh" exit 255 ;; esac export LC_ALL=C unset LANGUAGE set -e set -o pipefail set +e localedef='localedef --no-archive -A /usr/share/locale/locale.alias' rv=0 do_localedef() { print -ru2 -- + $localedef "$@" $localedef "$@" || rv=1 } die() { print -ru2 -- "E: $*" exit 1 } set -x (( USER_ID )) && die need root cd "$(dirname "$0")" || die cannot chdir [[ -s UTF-8.gz ]] || die cannot find charmap file [[ -s i18n_ctype.gz ]] || die cannot find ctype file [[ -d /usr/share/i18n/charmaps/. ]] || die cannot find charmap directory [[ -d /usr/share/i18n/locales/. ]] || die cannot find locale directory install -c -o root -g root -m 644 UTF-8.gz /usr/share/i18n/charmaps/ || \ die cannot install charmap gzip -d <i18n_ctype.gz | sudo install -c -o root -g root -m 644 /dev/stdin \ /usr/share/i18n/locales/i18n_ctype || die cannot install i18n_ctype set +x lC='% The following is a copy of i18n_ctype with the following change:'$'\n' lC+='% - The "blank", "cntrl", "space" classes are defined as specified by POSIX'$'\n\n' lTR= lo= s=0 hits=0 hitc=0 hitb=0 while IFS= read -r line; do case $s:$line { (0:LC_CTYPE) s=1 ;; (1:'END LC_CTYPE') s=2 ;; ([13]:*) lo+=$line$'\n' ;| (1:'space /') lC+=$line$'\n' lTR+=$line$'\n' hits=1 lC+=' <U0009>..<U000D>;<U0020>'$'\n' s=3 ;; (1:'cntrl /') lC+=$line$'\n' lTR+=$line$'\n' hitc=1 lC+=' <U0000>..<U001F>;<U007F>'$'\n' s=3 ;; (1:'blank /') lC+=$line$'\n' lTR+=$line$'\n' hitb=1 lC+=' <U0009>;<U0020>'$'\n' s=3 ;; (3:) lC+=$line$'\n' lTR+=$line$'\n' s=1 ;; (3:*) lTR+=$line$'\n' ;; (1:*'(<U0069>,<U0049>)'*) lC+=$line$'\n' lTR+=${line/'(<U0069>,<U0049>)'/'(<U0069>,<U0130>)'}$'\n' ;; (1:*'(<U0049>,<U0069>)'*) lC+=$line$'\n' lTR+=${line/'(<U0049>,<U0069>)'/'(<U0049>,<U0131>)'}$'\n' ;; (1:toupper*) lTR+='% The case conversions reflect Turkish conventions.'$'\n' ;& (1:*) lC+=$line$'\n' lTR+=$line$'\n' ;; } done </usr/share/i18n/locales/i18n_ctype [[ 
$s$hits$hitc$hitb = 2111 ]] || die failed to parse i18n_ctype function locpatch { local fn=$1 head rest line s nameref mid=$2 set -e s=0 while IFS= read -r line; do case $s:$line { (0:LC_CTYPE) s=1 ;& (0:*) head+=$line$'\n' ;; (1:translit_start|1:'END LC_CTYPE'|1:'% Include the neutral transliterations. The builtin C and') s=2 ;& (2:*) rest+=$line$'\n' ;; } done <"$fn" [[ $s = 2 ]] || die "failed to patch $fn" print -nr -- "$head$mid$rest" >"$fn" } [[ ! -e /usr/share/i18n/locales/C ]] || \ locpatch /usr/share/i18n/locales/C lC || die could not patch C [[ ! -e /usr/share/i18n/locales/i18n ]] || \ locpatch /usr/share/i18n/locales/i18n lo || die could not patch i18n [[ ! -e /usr/share/i18n/locales/tr_TR ]] || \ locpatch /usr/share/i18n/locales/tr_TR lTR || die could not patch TR for dir in /usr/lib/locale/*.utf8/LC_CTYPE; do if test -h "$dir"; then print -ru2 "# $dir is a symbolic link, skipping" continue fi loc=${dir%/LC_CTYPE} loc=${loc##*/} do_localedef -i "${loc%.utf8}" -c -f UTF-8 $loc done if [[ -s /usr/lib/locale/C.UTF-8/LC_CTYPE ]]; then do_localedef -i C -c -f UTF-8 /usr/lib/locale/C.UTF-8 elif [[ -s /usr/lib/locale/C.utf8/LC_CTYPE ]]; then do_localedef -i C -c -f UTF-8 /usr/lib/locale/C.utf8 else print -ru2 -- W: no UTF-8 locale on this system (( rv |= 2 )) fi exit $rv