We discussed this patch off-list and are going to leave it for a future
release. But I figured I would post it here so that others can try it
and so that I do not lose it.

The patch handles multi-byte characters when invoking
'uniq --ignore-case' while preserving performance in the case of
LC_ALL=C and the case without --ignore-case.

    $ yes abcdefghijklmnopqrstuvwxyz | head -n 10000000 > test.txt

    $ export LC_ALL=en_US.UTF-8 
    $ time ./src/uniq-new test.txt    
    real        0m0.420s
    $ time ./src/uniq-new --ignore-case test.txt
    real        0m0.761s

    $ export LC_ALL=C
    $ time ./src/uniq-new test.txt
    real        0m0.425s
    $ time ./src/uniq-new --ignore-case test.txt
    real        0m0.485s
    
    $ export LC_ALL=en_US.UTF-8 
    $ time ./src/uniq-old test.txt
    real        0m0.420s
    $ time ./src/uniq-old --ignore-case test.txt
    real        0m0.437s

    $ export LC_ALL=C
    $ time ./src/uniq-old test.txt
    real        0m0.416s
    $ time ./src/uniq-old --ignore-case test.txt
    real        0m0.626s

Collin

>From d93fda0413336267e1987683ce4f4778265e1b5f Mon Sep 17 00:00:00 2001
Message-ID: <d93fda0413336267e1987683ce4f4778265e1b5f.1757188059.git.collin.fu...@gmail.com>
From: Collin Funk <collin.fu...@gmail.com>
Date: Sat, 6 Sep 2025 12:30:20 -0700
Subject: [PATCH] uniq: support multi-byte characters with --ignore-case

* bootstrap.conf (gnulib_modules): Add c32tolower.
* src/uniq.c (different): Use mcel functions to scan the characters and
compare using c32tolower only if MB_CUR_MAX is greater than 1.
* tests/local.mk (all_tests): Add the new test.
* tests/uniq/uniq-ignorecase.sh: New file.
---
 bootstrap.conf                |  1 +
 src/uniq.c                    | 23 +++++++--
 tests/local.mk                |  1 +
 tests/uniq/uniq-ignorecase.sh | 89 +++++++++++++++++++++++++++++++++++
 4 files changed, 111 insertions(+), 3 deletions(-)
 create mode 100755 tests/uniq/uniq-ignorecase.sh

diff --git a/bootstrap.conf b/bootstrap.conf
index 03848e9ea..4453607ef 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -52,6 +52,7 @@ gnulib_modules="
   c-strcase
   c32iscntrl
   c32isspace
+  c32tolower
   c32width
   canon-host
   canonicalize
diff --git a/src/uniq.c b/src/uniq.c
index 9aa780574..6f0a02969 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -284,10 +284,27 @@ find_field (struct linebuffer const *line, idx_t *plen)
 static bool
 different (char *old, char *new, idx_t oldlen, idx_t newlen)
 {
-  if (ignore_case)
-    return oldlen != newlen || memcasecmp (old, new, oldlen);
+  if (1 < MB_CUR_MAX && ignore_case)  /* Multi-byte locale, ignoring case.  */
+    {
+      char *old_lim = old + oldlen;  /* One past the end of OLD.  */
+      char *new_lim = new + newlen;  /* One past the end of NEW.  */
+      for (mcel_t g1, g2; old < old_lim && new < new_lim;
+           old += g1.len, new += g2.len)  /* Step past what was scanned.  */
+        {
+          g1 = mcel_scan (old, old_lim);  /* Next character of OLD.  */
+          g2 = mcel_scan (new, new_lim);  /* Next character of NEW.  */
+          if (mcel_tocmp (c32tolower, g1, g2) != 0)  /* Via c32tolower.  */
+            return true;
+        }
+      return (old < old_lim) != (new < new_lim);  /* Leftover: differ.  */
+    }
   else
-    return oldlen != newlen || memcmp (old, new, oldlen);
+    {  /* Otherwise keep the memcasecmp/memcmp comparisons.  */
+      if (ignore_case)
+        return oldlen != newlen || memcasecmp (old, new, oldlen);
+      else
+        return oldlen != newlen || memcmp (old, new, oldlen);
+    }
 }
 
 /* Output the line in linebuffer LINE to standard output
diff --git a/tests/local.mk b/tests/local.mk
index a42a20fbe..ab7a40623 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -473,6 +473,7 @@ all_tests =					\
   tests/uniq/uniq.pl				\
   tests/uniq/uniq-perf.sh			\
   tests/uniq/uniq-collate.sh			\
+  tests/uniq/uniq-ignorecase.sh			\
   tests/misc/xattr.sh				\
   tests/misc/yes.sh				\
   tests/tail/wait.sh				\
diff --git a/tests/uniq/uniq-ignorecase.sh b/tests/uniq/uniq-ignorecase.sh
new file mode 100755
index 000000000..525334076
--- /dev/null
+++ b/tests/uniq/uniq-ignorecase.sh
@@ -0,0 +1,89 @@
+#!/bin/sh
+# Test uniq --ignore-case
+
+# Copyright (C) 2025 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ uniq printf
+
+# abc  (ASCII, lowercase first)
+# ABC  (duplicate when case is ignored: only the first line is kept)
+env printf 'abc\nABC\n' > inp || framework_failure_
+env printf 'abc\n' > exp || framework_failure_
+uniq --ignore-case inp > out || fail=1
+compare exp out || fail=1
+
+# ABC  (ASCII, uppercase first)
+# abc  (duplicate when case is ignored: only the first line is kept)
+env printf 'ABC\nabc\n' > inp || framework_failure_
+env printf 'ABC\n' > exp || framework_failure_
+uniq --ignore-case inp > out || fail=1
+compare exp out || fail=1
+
+test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available"
+
+LC_ALL=$LOCALE_FR_UTF8
+export LC_ALL
+
+# президент  (multi-byte, lowercase first)
+# ПРЕЗИДЕНТ  (duplicate when case is ignored: only the first line is kept)
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\u0442\n' > inp \
+  || framework_failure_
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' >> inp \
+  || framework_failure_
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\u0442\n' > exp \
+  || framework_failure_
+uniq --ignore-case inp > out || fail=1
+compare exp out || fail=1
+
+# ПРЕЗИДЕНТ  (multi-byte, uppercase first)
+# президент  (duplicate when case is ignored: only the first line is kept)
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' > inp \
+  || framework_failure_
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\u0442\n' >> inp \
+  || framework_failure_
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' > exp \
+  || framework_failure_
+uniq --ignore-case inp > out || fail=1
+compare exp out || fail=1
+
+# президен   (shorter line first: one character fewer)
+# ПРЕЗИДЕНТ  (not a duplicate: both lines are kept)
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\n' > inp \
+  || framework_failure_
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' >> inp \
+  || framework_failure_
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\n' > exp \
+  || framework_failure_
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' >> exp \
+  || framework_failure_
+uniq --ignore-case inp > out || fail=1
+compare exp out || fail=1
+
+# ПРЕЗИДЕНТ  (longer line first)
+# президен   (not a duplicate: both lines are kept)
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' > inp \
+  || framework_failure_
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\n' >> inp \
+  || framework_failure_
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' > exp \
+  || framework_failure_
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\n' >> exp \
+  || framework_failure_
+uniq --ignore-case inp > out || fail=1
+compare exp out || fail=1
+
+Exit $fail
-- 
2.51.0

Reply via email to