We discussed this patch off list and are going to leave it for a future release. But I figured I would post it here for others to try and so I do not lose it.
The patch handles multi-byte characters when invoking 'uniq --ignore-case' while preserving performance in the case of LC_ALL=C and the case without --ignore-case. $ yes abcdefghijklmnopqrstuvwxyz | head -n 10000000 > test.txt $ export LC_ALL=en_US.UTF-8 $ time ./src/uniq-new test.txt real 0m0.420s $ time ./src/uniq-new --ignore-case test.txt real 0m0.761s $ export LC_ALL=C $ time ./src/uniq-new test.txt real 0m0.425s $ time ./src/uniq-new --ignore-case test.txt real 0m0.485s $ export LC_ALL=en_US.UTF-8 $ time ./src/uniq-old test.txt real 0m0.420s $ time ./src/uniq-old --ignore-case test.txt real 0m0.437s $ export LC_ALL=C $ time ./src/uniq-old test.txt real 0m0.416s $ time ./src/uniq-old --ignore-case test.txt real 0m0.626s Collin
>From d93fda0413336267e1987683ce4f4778265e1b5f Mon Sep 17 00:00:00 2001 Message-ID: <d93fda0413336267e1987683ce4f4778265e1b5f.1757188059.git.collin.fu...@gmail.com> From: Collin Funk <collin.fu...@gmail.com> Date: Sat, 6 Sep 2025 12:30:20 -0700 Subject: [PATCH] uniq: support multi-byte characters with --ignore-case * bootstrap.conf (gnulib_modules): Add c32tolower. * src/uniq.c (different): Use mcel functions to scan the characters and compare using c32tolower only if MB_CUR_MAX is greater than 1. * tests/local.mk: Add it. * tests/uniq/uniq-ignorecase.sh (all_tests): Add it. --- bootstrap.conf | 1 + src/uniq.c | 23 +++++++-- tests/local.mk | 1 + tests/uniq/uniq-ignorecase.sh | 89 +++++++++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 3 deletions(-) create mode 100755 tests/uniq/uniq-ignorecase.sh diff --git a/bootstrap.conf b/bootstrap.conf index 03848e9ea..4453607ef 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -52,6 +52,7 @@ gnulib_modules=" c-strcase c32iscntrl c32isspace + c32tolower c32width canon-host canonicalize diff --git a/src/uniq.c b/src/uniq.c index 9aa780574..6f0a02969 100644 --- a/src/uniq.c +++ b/src/uniq.c @@ -284,10 +284,27 @@ find_field (struct linebuffer const *line, idx_t *plen) static bool different (char *old, char *new, idx_t oldlen, idx_t newlen) { - if (ignore_case) - return oldlen != newlen || memcasecmp (old, new, oldlen); + if (1 < MB_CUR_MAX && ignore_case) + { + char *old_lim = old + oldlen; + char *new_lim = new + newlen; + for (mcel_t g1, g2; old < old_lim && new < new_lim; + old += g1.len, new += g2.len) + { + g1 = mcel_scan (old, old_lim); + g2 = mcel_scan (new, new_lim); + if (mcel_tocmp (c32tolower, g1, g2) != 0) + return true; + } + return (old < old_lim) != (new < new_lim); + } else - return oldlen != newlen || memcmp (old, new, oldlen); + { + if (ignore_case) + return oldlen != newlen || memcasecmp (old, new, oldlen); + else + return oldlen != newlen || memcmp (old, new, oldlen); + } } /* Output the 
line in linebuffer LINE to standard output diff --git a/tests/local.mk b/tests/local.mk index a42a20fbe..ab7a40623 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -473,6 +473,7 @@ all_tests = \ tests/uniq/uniq.pl \ tests/uniq/uniq-perf.sh \ tests/uniq/uniq-collate.sh \ + tests/uniq/uniq-ignorecase.sh \ tests/misc/xattr.sh \ tests/misc/yes.sh \ tests/tail/wait.sh \ diff --git a/tests/uniq/uniq-ignorecase.sh b/tests/uniq/uniq-ignorecase.sh new file mode 100755 index 000000000..525334076 --- /dev/null +++ b/tests/uniq/uniq-ignorecase.sh @@ -0,0 +1,89 @@ +#!/bin/sh +# Test uniq --ignore-case + +# Copyright (C) 2025 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +. 
"${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ uniq printf + +# abc +# ABC +env printf 'abc\nABC\n' > inp || framework_failure_ +env printf 'abc\n' > exp || framework_failure_ +uniq --ignore-case inp > out || fail=1 +compare exp out || fail=1 + +# ABC +# abc +env printf 'ABC\nabc\n' > inp || framework_failure_ +env printf 'ABC\n' > exp || framework_failure_ +uniq --ignore-case inp > out || fail=1 +compare exp out || fail=1 + +test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available" + +LC_ALL=$LOCALE_FR_UTF8 +export LC_ALL + +# президент +# ПРЕЗИДЕНТ +env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\u0442\n' > inp \ + || framework_failure +env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' >> inp \ + || framework_failure +env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\u0442\n' > exp \ + || framework_failure +uniq --ignore-case inp > out || fail=1 +compare exp out || fail=1 + +# ПРЕЗИДЕНТ +# президент +env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' > inp \ + || framework_failure +env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\u0442\n' >> inp \ + || framework_failure +env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' > exp \ + || framework_failure +uniq --ignore-case inp > out || fail=1 +compare exp out || fail=1 + +# президен +# ПРЕЗИДЕНТ +env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\n' > inp \ + || framework_failure +env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' >> inp \ + || framework_failure +env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\n' > exp \ + || framework_failure +env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' >> exp \ + || framework_failure +uniq --ignore-case inp > out || fail=1 +compare exp out || fail=1 + +# ПРЕЗИДЕНТ +# президен +env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' > inp \ + || framework_failure +env printf 
'\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\n' >> inp \ + || framework_failure +env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' > exp \ + || framework_failure +env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\n' >> exp \ + || framework_failure +uniq --ignore-case inp > out || fail=1 +compare exp out || fail=1 + +Exit $fail -- 2.51.0