> From:Ludovic Courtès <l...@gnu.org>

> > I know of two categories of bugs.  One has to do with case conversions
> > and case-insensitive comparisons, which must be done on entire strings
> > but are currently done for each character.  Here are some examples:
> >
> >   (string-upcase "Straße")         => "STRAßE"  
> (should be "STRASSE")
> >   (string-downcase "ΧΑΟΣΣ")        => "χαοσσ"  
> (should be "χαoσς")
> >   (string-downcase "ΧΑΟΣ Σ")       => "χαοσ σ"  
> (should be "χαoς σ")
> >   (string-ci=? "Straße" "Strasse") => #f        
> (should be #t)
> >   (string-ci=? "ΧΑΟΣ" "χαoσ")      => #f        
> (should be #t)
> 
> (Mike pointed out that SRFI-13 does not consider these bugs, but that’s
> linguistically wrong so I’d consider it a bug.  Note that all these
> functions are ‘linguistically buggy’ anyway since they don’t have a
> locale argument, which breaks with Turkish ‘İ’.)
> 
> Can we first check what would need to be done to fix this in 2.0.x?
> 
> At first glance:
> 
>   - “Straße” is normally stored as a Latin1 string, so it would need to
>     be converted to UTF-* before it can be passed to one of the
>     unicase.h functions.  *Or*, we could check with bug-libunistring
>     what it would take to add Latin1 string case mapping functions.
> 
>     Interestingly, ‘ß’ is the only Latin1 character that doesn’t have a
>     one-to-one case mapping.  All other Latin1 strings can be handled by
>     iterating over characters, as is currently done.

There is the micro sign, which, when case folded, becomes a Greek mu.
It is still a single character, but, it is the only latin-1 character that,
when folded, becomes a non-Latin-1 character

> 
>     With this in mind, we could hack our way so that strings that
>     contain an ‘ß’ are stored as UTF-32 (yes, that’s a hack.)
> 
>   - For ‘string-downcase’, the Greek strings above are wide strings, so
>     they can be passed directly to u32_toupper & co.  For these, the fix
>     is almost two lines.
> 
>   - Case insensitive comparison is more difficult, as you already
>     pointed out.  To do it right we’d probably need to convert Latin1
>     strings to UTF-32 and then pass it to u32_casecmp.  We don’t have to
>     do the conversion every time, though: we could just change Latin1
>     strings in-place so they now point to a wide stringbuf upon the
>     first ‘string-ci=’.
> 
> Thoughts?

What about the srfi-13 case insensitive comparisons (the ones that don't
terminate in question marks, like string-ci<)?  Should they remain
as srfi-13 suggests, or should they remain similar in behavior
to the question-mark-terminated comparisons?

Mark is right that fixing this will not be pretty.  The case insensitive
string comparisons, for example, could be patched like the attached
snippet. If you don't find it too ugly of an approach, I could work on
a real patch.

Thanks,

Mike
diff --git a/libguile/strorder.c b/libguile/strorder.c
index a51ce17..6df8343 100644
--- a/libguile/strorder.c
+++ b/libguile/strorder.c
@@ -21,6 +21,8 @@
 # include <config.h>
 #endif
 
+#include <unicase.h>
+
 #include "libguile/_scm.h"
 #include "libguile/chars.h"
 #include "libguile/strings.h"
@@ -42,6 +44,164 @@ srfi13_cmp (SCM s1, SCM s2, SCM (*cmp) (SCM, SCM, SCM, SCM, SCM, SCM))
     return SCM_BOOL_F;
 }
 
+#define SHARP_S (0xdf)
+#define MICRO_SIGN (0xb5)
+#define MU (0x3bc)
+/* This function compares does a comparison of the case-folded
+   versions of S1 and S2. It returns -1 if S1 < S2, 0 if they are equal
+   or 1 if S1 > S2. */
+static int
+compare_folded_strings (SCM s1, SCM s2)
+{
+  if (!scm_is_string (s1))
+    scm_wrong_type_arg (__func__, 0, s1);
+  if (!scm_is_string (s2))
+    scm_wrong_type_arg (__func__, 1, s2);
+  if (scm_i_is_narrow_string (s1) && scm_i_is_narrow_string (s2))
+    {
+      size_t cindex1 = 0, cindex2 = 0;
+      const size_t cend1 = scm_i_string_length (s1);
+      const size_t cend2 = scm_i_string_length (s2);
+      ucs4_t a, b;
+      int ss1 = 0, ss2 = 0;
+  
+      /* For narrow strings, folding equals downcasing except for sharp
+	 s which becomes 'ss' and the micro sign which becomes Greek
+	 mu.  */
+      while (cindex1 < cend1 && cindex2 < cend2)
+	{
+	  if (ss1)
+	    a = (ucs4_t) 's';
+	  else
+	    {
+	      a = uc_tolower ((unsigned char) (scm_i_string_chars (s1))[cindex1]);
+	      if (a == SHARP_S)
+		{
+		  a = (ucs4_t) 's';
+		  ss1 = 2;
+		}
+	      if (a == MICRO_SIGN)
+		a = MU;
+	    }
+	  if (ss2)
+	    b = (ucs4_t) 's';
+	  else
+	    {
+	      b = uc_tolower ((unsigned char) (scm_i_string_chars (s2))[cindex2]);
+	      if (b == SHARP_S)
+		{
+		  b = 's';
+		  ss2 = 2;
+		}
+	      if (b == MICRO_SIGN)
+		b = MU;
+	    }
+	  if (a < b)
+	    return -1;
+	  else if (a > b)
+	    return 1;
+	  if (ss1)
+	    ss1 --;
+	  if (!ss1)
+	    cindex1 ++;
+	  if (ss2)
+	    ss2 --;
+	  if (!ss2)
+	    cindex2 ++;
+	}
+      if (cindex1 < cend1)
+	return 1;
+      else if (cindex2 < cend2)
+	return -1;
+
+      return 0;
+    }
+  else if (!scm_i_is_narrow_string (s1) && !scm_i_is_narrow_string (s2))
+    {
+      int ret, result;
+
+      ret = u32_casecmp ((const uint32_t *) scm_i_string_wide_chars (s1),
+			 scm_i_string_length (s1),
+			 (const uint32_t *) scm_i_string_wide_chars (s2),
+			 scm_i_string_length (s2),
+			 NULL, NULL, &result);
+      if (ret != 0)
+        scm_encoding_error (__func__, errno,
+			    "cannot do case-folded comparison",
+			    SCM_BOOL_F,
+			    /* FIXME: Faulty character unknown.  */
+			    SCM_BOOL_F);
+      return result;
+    }
+  else
+    {
+      int swap = 1, ss1 = 0;
+      uint32_t *str2 = NULL;
+      size_t cindex1 = 0, cindex2 = 0;
+      const size_t cend1 = scm_i_string_length (s1);
+      size_t cend2;
+      ucs4_t a, b;
+
+      /* Swap so that s1 is narrow and s2 is wide.  */
+      if (scm_i_is_narrow_string (s2))
+	{
+	  SCM s3;
+	  s3 = s1;
+	  s1 = s2;
+	  s2 = s3;
+	  swap = -1;
+	}
+      str2 = u32_casefold ((const uint32_t *) scm_i_string_wide_chars (s2),
+			   scm_i_string_length (s2),
+			   NULL, NULL, NULL, &cend2);
+      if (str2 == NULL)
+	scm_memory_error (__func__);
+
+      while (cindex1 < cend1 && cindex2 < cend2)
+	{
+	  if (ss1)
+	    a = (ucs4_t) 's';
+	  else
+	    {
+	      a = uc_tolower ((unsigned char) scm_i_string_chars (s1)[cindex1]);
+	      if (a == SHARP_S)
+		{
+		  a = (ucs4_t) 's';
+		  ss1 = 2;
+		}
+	      if (a == MICRO_SIGN)
+		a = MU;
+	    }
+	  b = str2[cindex2];
+	  if (a < b)
+	    {
+	      free (str2);
+	      return -1 * swap;
+	    }
+	  else if (a > b)
+	    {
+	      free (str2);
+	      return 1 * swap;
+	    }
+	  if (ss1)
+	    ss1 --;
+	  if (!ss1) 
+	    cindex1 ++;
+	  cindex2 ++;
+	}
+      free (str2);
+      if (cindex1 < cend1)
+	return -1 * swap;
+      else if (cindex2 > cend2)
+	return 1 * swap;
+
+      return 0;
+    }
+      
+  return 0;
+}
+
+
 static SCM scm_i_string_equal_p (SCM s1, SCM s2, SCM rest);
 SCM_DEFINE (scm_i_string_equal_p, "string=?", 0, 2, 1,
             (SCM s1, SCM s2, SCM rest),
@@ -80,8 +240,8 @@ static SCM scm_i_string_ci_equal_p (SCM s1, SCM s2, SCM rest);
 SCM_DEFINE (scm_i_string_ci_equal_p, "string-ci=?", 0, 2, 1,
             (SCM s1, SCM s2, SCM rest),
 	    "Case-insensitive string equality predicate; return @code{#t} if\n"
-	    "the two strings are the same length and their component\n"
-	    "characters match (ignoring case) at each position; otherwise\n"
+	    "case-folded versions of the the two strings are the same length\n"
+            "and their component characters match at each position; otherwise\n"
 	    "return @code{#f}.")
 #define FUNC_NAME s_scm_i_string_ci_equal_p
 {
@@ -89,13 +249,13 @@ SCM_DEFINE (scm_i_string_ci_equal_p, "string-ci=?", 0, 2, 1,
     return SCM_BOOL_T;
   while (!scm_is_null (rest))
     {
-      if (scm_is_false (srfi13_cmp (s1, s2, scm_string_ci_eq)))
+      if (0 != compare_folded_strings (s1, s2))
         return SCM_BOOL_F;
       s1 = s2;
       s2 = scm_car (rest);
       rest = scm_cdr (rest);
     }
-  return srfi13_cmp (s1, s2, scm_string_ci_eq);
+  return scm_from_bool (0 == compare_folded_strings (s1, s2));
 }
 #undef FUNC_NAME
 
@@ -218,6 +378,7 @@ SCM scm_string_geq_p (SCM s1, SCM s2)
 }
 #undef FUNC_NAME
 
+
 static SCM scm_i_string_ci_less_p (SCM s1, SCM s2, SCM rest);
 SCM_DEFINE (scm_i_string_ci_less_p, "string-ci<?", 0, 2, 1,
             (SCM s1, SCM s2, SCM rest),
@@ -230,20 +391,20 @@ SCM_DEFINE (scm_i_string_ci_less_p, "string-ci<?", 0, 2, 1,
     return SCM_BOOL_T;
   while (!scm_is_null (rest))
     {
-      if (scm_is_false (srfi13_cmp (s1, s2, scm_string_ci_lt)))
+      if (-1 != compare_folded_strings (s1, s2))
         return SCM_BOOL_F;
       s1 = s2;
       s2 = scm_car (rest);
       rest = scm_cdr (rest);
     }
-  return srfi13_cmp (s1, s2, scm_string_ci_lt);
+  return scm_from_bool (-1 == compare_folded_strings (s1, s2));
 }
 #undef FUNC_NAME
 
 SCM scm_string_ci_less_p (SCM s1, SCM s2)
 #define FUNC_NAME s_scm_i_string_ci_less_p
 {
-  return srfi13_cmp (s1, s2, scm_string_ci_lt);
+  return scm_from_bool (-1 == compare_folded_strings (s1, s2));
 }
 #undef FUNC_NAME
 
@@ -259,20 +420,20 @@ SCM_DEFINE (scm_i_string_ci_leq_p, "string-ci<=?", 0, 2, 1,
     return SCM_BOOL_T;
   while (!scm_is_null (rest))
     {
-      if (scm_is_false (srfi13_cmp (s1, s2, scm_string_ci_le)))
+      if (1 == compare_folded_strings (s1, s2))
         return SCM_BOOL_F;
       s1 = s2;
       s2 = scm_car (rest);
       rest = scm_cdr (rest);
     }
-  return srfi13_cmp (s1, s2, scm_string_ci_le);
+  return scm_from_bool (1 != compare_folded_strings (s1, s2));
 }
 #undef FUNC_NAME
 
 SCM scm_string_ci_leq_p (SCM s1, SCM s2)
 #define FUNC_NAME s_scm_i_string_ci_leq_p
 {
-  return srfi13_cmp (s1, s2, scm_string_ci_le);
+  return scm_from_bool (1 != compare_folded_strings (s1, s2));
 }
 #undef FUNC_NAME
 
@@ -288,13 +449,13 @@ SCM_DEFINE (scm_i_string_ci_gr_p, "string-ci>?", 0, 2, 1,
     return SCM_BOOL_T;
   while (!scm_is_null (rest))
     {
-      if (scm_is_false (srfi13_cmp (s1, s2, scm_string_ci_gt)))
+      if (1 != compare_folded_strings (s1, s2))
         return SCM_BOOL_F;
       s1 = s2;
       s2 = scm_car (rest);
       rest = scm_cdr (rest);
     }
-  return srfi13_cmp (s1, s2, scm_string_ci_gt);
+  return scm_from_bool (1 == compare_folded_strings (s1, s2));
 }
 #undef FUNC_NAME
 
@@ -317,20 +478,20 @@ SCM_DEFINE (scm_i_string_ci_geq_p, "string-ci>=?", 0, 2, 1,
     return SCM_BOOL_T;
   while (!scm_is_null (rest))
     {
-      if (scm_is_false (srfi13_cmp (s1, s2, scm_string_ci_ge)))
+      if (-1 == compare_folded_strings (s1, s2))
         return SCM_BOOL_F;
       s1 = s2;
       s2 = scm_car (rest);
       rest = scm_cdr (rest);
     }
-  return srfi13_cmp (s1, s2, scm_string_ci_ge);
+  return scm_from_bool (-1 != compare_folded_strings (s1, s2));
 }
 #undef FUNC_NAME
 
 SCM scm_string_ci_geq_p (SCM s1, SCM s2)
 #define FUNC_NAME s_scm_i_string_ci_geq_p
 {
-  return srfi13_cmp (s1, s2, scm_string_ci_ge);
+  return scm_from_bool (-1 != compare_folded_strings (s1, s2));
 }
 #undef FUNC_NAME
 

Reply via email to