Hello all, I think a good number of Linux From Scratch builders are aware of this, but Diffutils' handling of multibyte whitespace is broken.
As an example, this is a testcase for testing for i18n compliance (from LSB): echo "hello こんにちは ですね。 " > file1.txt echo "hello こんにちは ですね。 " > file2.txt diff -b file1.txt file2.txt (And note: the " " whitespace character is not two spaces, it needs to be copied in directly from your browser or email client into a UTF-8 terminal) Expected result: no output (because diff is set to ignore whitespace) Wrong result: This (with unpatched Diffutils): 1c1 < hello こんにちは ですね。 --- > hello こんにちは ですね。 To fix this problem, I have an attached patch from OpenSUSE that fixes whitespace handling in multibyte situations. It is below this message. -- William Immendorf The ultimate in free computing. Messages in plain text, please, no HTML. GPG key ID: 1697BE98 If it's not signed, it's not from me. -------------- "Every nonfree program has a lord, a master -- and if you use the program, he is your master." Richard Stallman (patch follows in LFS format, apply with -Np1) Submitted By: William Immendorf <[email protected]> Date: 2010-12-24 Initial Package Version: 3.0 Upstream Status: Unknown Origin: OpenSUSE Description: Fix handling of whitespace in multibyte situations. diff -Naur diffutils-3.0.orig/src/diff.c diffutils-3.0/src/diff.c --- diffutils-3.0.orig/src/diff.c 2010-12-24 10:58:40.376759001 -0600 +++ diffutils-3.0/src/diff.c 2010-12-24 10:58:54.303758920 -0600 @@ -284,6 +284,13 @@ re_set_syntax (RE_SYNTAX_GREP | RE_NO_POSIX_BACKTRACKING); excluded = new_exclude (); +#ifdef HANDLE_MULTIBYTE + if (MB_CUR_MAX > 1) + lines_differ = lines_differ_multibyte; + else +#endif + lines_differ = lines_differ_singlebyte; + /* Decode the options. 
*/ while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1) diff -Naur diffutils-3.0.orig/src/diff.h diffutils-3.0/src/diff.h --- diffutils-3.0.orig/src/diff.h 2010-12-24 10:58:40.376759001 -0600 +++ diffutils-3.0/src/diff.h 2010-12-24 10:58:54.303758920 -0600 @@ -23,6 +23,17 @@ #include <stdio.h> #include <unlocked-io.h> +/* For platform which support the ISO C amendement 1 functionality we + support user defined character classes. */ +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ +# include <wchar.h> +# include <wctype.h> +# if defined (HAVE_MBRTOWC) +# define HANDLE_MULTIBYTE 1 +# endif +#endif + /* What kind of changes a hunk contains. */ enum changes { @@ -350,7 +361,13 @@ extern char const pr_program[]; char *concat (char const *, char const *, char const *); char *dir_file_pathname (char const *, char const *); -bool lines_differ (char const *, char const *); + +bool (*lines_differ) (char const *, char const *); +bool lines_differ_singlebyte (char const *, char const *); +#ifdef HANDLE_MULTIBYTE +bool lines_differ_multibyte (char const *, char const *); +#endif + lin translate_line_number (struct file_data const *, lin); struct change *find_change (struct change *); struct change *find_reverse_change (struct change *); diff -Naur diffutils-3.0.orig/src/io.c diffutils-3.0/src/io.c --- diffutils-3.0.orig/src/io.c 2010-12-24 10:58:40.376759001 -0600 +++ diffutils-3.0/src/io.c 2010-12-24 10:58:54.304758927 -0600 @@ -22,6 +22,7 @@ #include <cmpbuf.h> #include <file-type.h> #include <xalloc.h> +#include <assert.h> /* Rotate an unsigned value to the left. */ #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n))) @@ -194,6 +195,28 @@ /* Split the file into lines, simultaneously computing the equivalence class for each line. 
*/ +#ifdef HANDLE_MULTIBYTE +# define MBC2WC(P, END, MBLENGTH, WC, STATE, CONVFAIL) \ +do \ +{ \ + mbstate_t state_bak = STATE; \ + \ + CONVFAIL = 0; \ + MBLENGTH = mbrtowc (&WC, P, END - (char const *) P, &STATE); \ + \ + switch (MBLENGTH) \ + { \ + case (size_t) -2: \ + case (size_t) -1: \ + STATE = state_bak; \ + ++CONVFAIL; \ + /* Fall through. */ \ + case 0: \ + MBLENGTH = 1; \ + } \ +} \ +while (0) +#endif static void find_and_hash_each_line (struct file_data *current) @@ -220,11 +243,294 @@ bool same_length_diff_contents_compare_anyway = diff_length_compare_anyway | ignore_case; +#ifdef HANDLE_MULTIBYTE + wchar_t wc; + size_t mblength; + mbstate_t state; + int convfail; + + memset (&state, '\0', sizeof (mbstate_t)); +#endif + while (p < suffix_begin) { char const *ip = p; h = 0; +#ifdef HANDLE_MULTIBYTE + if (MB_CUR_MAX > 1) + { + wchar_t lo_wc; + char mbc[MB_LEN_MAX]; + mbstate_t state_wc; + + /* Hash this line until we find a newline. */ + switch (ignore_white_space) + { + case IGNORE_ALL_SPACE: + while (1) + { + if (*p == '\n') + { + ++p; + break; + } + + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + + if (convfail) + mbc[0] = *p++; + else if (!iswspace (wc)) + { + bool flag = 0; + + if (ignore_case) + { + lo_wc = towlower (wc); + if (lo_wc != wc) + { + flag = 1; + + p += mblength; + memset (&state_wc, '\0', sizeof (mbstate_t)); + mblength = wcrtomb (mbc, lo_wc, &state_wc); + + assert (mblength != (size_t) -1 && + mblength != (size_t) -2); + + mblength = mblength < 1 ? 
1 : mblength; + } + } + + if (!flag) + { + for (i = 0; i < mblength; i++) + mbc[i] = *p++; + } + } + else + { + p += mblength; + continue; + } + + for (i = 0; i < mblength; i++) + { + c = mbc[i]; + h = HASH (h, c); + } + } + break; + + case IGNORE_SPACE_CHANGE: + while (1) + { + if (*p == '\n') + { + ++p; + break; + } + + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + + if (!convfail && iswspace (wc)) + { + while (1) + { + if (*p == '\n') + { + ++p; + goto hashing_done; + } + + p += mblength; + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + if (convfail || (!convfail && !iswspace (wc))) + break; + } + h = HASH (h, ' '); + } + + /* WC is now the first non-space. */ + if (convfail) + mbc[0] = *p++; + else + { + bool flag = 0; + + if (ignore_case) + { + lo_wc = towlower (wc); + if (lo_wc != wc) + { + flag = 1; + + p += mblength; + memset (&state_wc, '\0', sizeof (mbstate_t)); + mblength = wcrtomb (mbc, lo_wc, &state_wc); + + assert (mblength != (size_t) -1 && + mblength != (size_t) -2); + + mblength = mblength < 1 ? 1 : mblength; + } + } + + if (!flag) + { + for (i = 0; i < mblength; i++) + mbc[i] = *p++; + } + } + + for (i = 0; i < mblength; i++) + { + c = mbc[i]; + h = HASH (h, c); + } + } + break; + + case IGNORE_TAB_EXPANSION: + { + size_t column = 0; + + while (1) + { + if (*p == '\n') + { + ++p; + break; + } + + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + + if (convfail) + { + c = *p++; + h = HASH (h, c); + ++column; + } + else + { + bool flag; + + switch (wc) + { + case L'\b': + column -= 0 < column; + h = HASH (h, '\b'); + ++p; + break; + + case L'\t': + { + size_t repetitions; + repetitions = tabsize - column % tabsize; + column = (column + repetitions < column + ? 
0 + : column + repetitions); + do + h = HASH (h, ' '); + while (--repetitions != 0); + ++p; + } + break; + + case L'\r': + column = 0; + h = HASH (h, '\r'); + ++p; + break; + + default: + flag = 0; + column += wcwidth (wc); + if (ignore_case) + { + lo_wc = towlower (wc); + if (lo_wc != wc) + { + flag = 1; + p += mblength; + memset (&state_wc, '\0', sizeof (mbstate_t)); + mblength = wcrtomb (mbc, lo_wc, &state_wc); + + assert (mblength != (size_t) -1 && + mblength != (size_t) -2); + + mblength = mblength < 1 ? 1 : mblength; + } + } + + if (!flag) + { + for (i = 0; i < mblength; i++) + mbc[i] = *p++; + } + + for (i = 0; i < mblength; i++) + { + c = mbc[i]; + h = HASH (h, c); + } + } + } + } + } + break; + + default: + while (1) + { + if (*p == '\n') + { + ++p; + break; + } + + MBC2WC (p, suffix_begin, mblength, wc, state, convfail); + + if (convfail) + mbc[0] = *p++; + else + { + int flag = 0; + + if (ignore_case) + { + lo_wc = towlower (wc); + if (lo_wc != wc) + { + flag = 1; + p += mblength; + memset (&state_wc, '\0', sizeof (mbstate_t)); + mblength = wcrtomb (mbc, lo_wc, &state_wc); + + assert (mblength != (size_t) -1 && + mblength != (size_t) -2); + + mblength = mblength < 1 ? 1 : mblength; + } + } + + if (!flag) + { + for (i = 0; i < mblength; i++) + mbc[i] = *p++; + } + } + + for (i = 0; i < mblength; i++) + { + c = mbc[i]; + h = HASH (h, c); + } + } + } + goto hashing_done; + } +#endif /* Hash this line until we find a newline. */ if (ignore_case) diff -Naur diffutils-3.0.orig/src/util.c diffutils-3.0/src/util.c --- diffutils-3.0.orig/src/util.c 2010-12-24 10:58:40.376759001 -0600 +++ diffutils-3.0/src/util.c 2010-12-24 10:58:54.304758927 -0600 @@ -317,7 +317,7 @@ Return nonzero if the lines differ. 
*/ bool -lines_differ (char const *s1, char const *s2) +lines_differ_singlebyte (char const *s1, char const *s2) { register char const *t1 = s1; register char const *t2 = s2; @@ -446,6 +446,293 @@ return true; } + +#ifdef HANDLE_MULTIBYTE +# define MBC2WC(T, END, MBLENGTH, WC, STATE, CONVFAIL) \ +do \ +{ \ + mbstate_t bak = STATE; \ + \ + CONVFAIL = 0; \ + MBLENGTH = mbrtowc (&WC, T, END - T, &STATE); \ + \ + switch (MBLENGTH) \ + { \ + case (size_t)-2: \ + case (size_t)-1: \ + STATE = bak; \ + ++CONVFAIL; \ + /* Fall through. */ \ + case 0: \ + MBLENGTH = 1; \ + } \ +} \ +while (0) + +bool +lines_differ_multibyte (char const *s1, char const *s2) +{ + unsigned char const *end1, *end2; + unsigned char c1, c2; + wchar_t wc1, wc2, wc1_bak, wc2_bak; + size_t mblen1, mblen2; + mbstate_t state1, state2, state1_bak, state2_bak; + int convfail1, convfail2, convfail1_bak, convfail2_bak; + + unsigned char const *t1 = (unsigned char const *) s1; + unsigned char const *t2 = (unsigned char const *) s2; + unsigned char const *t1_bak, *t2_bak; + size_t column = 0; + + if (ignore_white_space == IGNORE_NO_WHITE_SPACE && !ignore_case) + { + while (*t1 != '\n') + if (*t1++ != * t2++) + return 1; + return 0; + } + + memset (&state1, '\0', sizeof (mbstate_t)); + memset (&state2, '\0', sizeof (mbstate_t)); + + end1 = s1 + strlen (s1); + end2 = s2 + strlen (s2); + + while (1) + { + c1 = *t1; + c2 = *t2; + MBC2WC (t1, end1, mblen1, wc1, state1, convfail1); + MBC2WC (t2, end2, mblen2, wc2, state2, convfail2); + + /* Test for exact char equality first, since it's a common case. */ + if (convfail1 ^ convfail2) + break; + else if (convfail1 && convfail2 && c1 != c2) + break; + else if (!convfail1 && !convfail2 && wc1 != wc2) + { + switch (ignore_white_space) + { + case IGNORE_ALL_SPACE: + /* For -w, just skip past any white space. 
*/ + while (1) + { + if (convfail1) + break; + else if (wc1 == L'\n' || !iswspace (wc1)) + break; + + t1 += mblen1; + c1 = *t1; + MBC2WC (t1, end1, mblen1, wc1, state1, convfail1); + } + + while (1) + { + if (convfail2) + break; + else if (wc2 == L'\n' || !iswspace (wc2)) + break; + + t2 += mblen2; + c2 = *t2; + MBC2WC (t2, end2, mblen2, wc2, state2, convfail2); + } + t1 += mblen1; + t2 += mblen2; + break; + + case IGNORE_SPACE_CHANGE: + /* For -b, advance past any sequence of white space in + line 1 and consider it just one space, or nothing at + all if it is at the end of the line. */ + if (wc1 != L'\n' && iswspace (wc1)) + { + size_t mblen_bak; + mbstate_t state_bak; + + do + { + t1 += mblen1; + mblen_bak = mblen1; + state_bak = state1; + MBC2WC (t1, end1, mblen1, wc1, state1, convfail1); + } + while (!convfail1 && (wc1 != L'\n' && iswspace (wc1))); + + state1 = state_bak; + mblen1 = mblen_bak; + t1 -= mblen1; + convfail1 = 0; + wc1 = L' '; + } + + /* Likewise for line 2. */ + if (wc2 != L'\n' && iswspace (wc2)) + { + size_t mblen_bak; + mbstate_t state_bak; + + do + { + t2 += mblen2; + mblen_bak = mblen2; + state_bak = state2; + MBC2WC (t2, end2, mblen2, wc2, state2, convfail2); + } + while (!convfail2 && (wc2 != L'\n' && iswspace (wc2))); + + state2 = state_bak; + mblen2 = mblen_bak; + t2 -= mblen2; + convfail2 = 0; + wc2 = L' '; + } + + if (wc1 != wc2) + { + if (wc2 == L' ' && wc1 != L'\n' && + t1 > (unsigned char const *)s1 && + !convfail1_bak && iswspace (wc1_bak)) + { + t1 = t1_bak; + wc1 = wc1_bak; + state1 = state1_bak; + convfail1 = convfail1_bak; + continue; + } + if (wc1 == L' ' && wc2 != L'\n' + && t2 > (unsigned char const *)s2 + && !convfail2_bak && iswspace (wc2_bak)) + { + t2 = t2_bak; + wc2 = wc2_bak; + state2 = state2_bak; + convfail2 = convfail2_bak; + continue; + } + } + + t1_bak = t1; t2_bak = t2; + wc1_bak = wc1; wc2_bak = wc2; + state1_bak = state1; state2_bak = state2; + convfail1_bak = convfail1; convfail2_bak = convfail2; + + if (wc1 == 
L'\n') + wc1 = L' '; + else + t1 += mblen1; + + if (wc2 == L'\n') + wc2 = L' '; + else + t2 += mblen2; + + break; + + case IGNORE_TAB_EXPANSION: + if ((wc1 == L' ' && wc2 == L'\t') + || (wc1 == L'\t' && wc2 == L' ')) + { + size_t column2 = column; + + while (1) + { + if (convfail1) + { + ++t1; + break; + } + else if (wc1 == L' ') + column++; + else if (wc1 == L'\t') + column += tabsize - column % tabsize; + else + { + t1 += mblen1; + break; + } + + t1 += mblen1; + c1 = *t1; + MBC2WC (t1, end1, mblen1, wc1, state1, convfail1); + } + + while (1) + { + if (convfail2) + { + ++t2; + break; + } + else if (wc2 == L' ') + column2++; + else if (wc2 == L'\t') + column2 += tabsize - column2 % tabsize; + else + { + t2 += mblen2; + break; + } + + t2 += mblen2; + c2 = *t2; + MBC2WC (t2, end2, mblen2, wc2, state2, convfail2); + } + + if (column != column2) + return 1; + } + else + { + t1 += mblen1; + t2 += mblen2; + } + break; + + case IGNORE_NO_WHITE_SPACE: + t1 += mblen1; + t2 += mblen2; + break; + } + + /* Lowercase all letters if -i is specified. */ + if (ignore_case) + { + if (!convfail1) + wc1 = towlower (wc1); + if (!convfail2) + wc2 = towlower (wc2); + } + + if (convfail1 ^ convfail2) + break; + else if (convfail1 && convfail2 && c1 != c2) + break; + else if (!convfail1 && !convfail2 && wc1 != wc2) + break; + } + else + { + t1_bak = t1; t2_bak = t2; + wc1_bak = wc1; wc2_bak = wc2; + state1_bak = state1; state2_bak = state2; + convfail1_bak = convfail1; convfail2_bak = convfail2; + + t1 += mblen1; t2 += mblen2; + } + + if (!convfail1 && wc1 == L'\n') + return 0; + + column += convfail1 ? 1 : + (wc1 == L'\t') ? tabsize - column % tabsize : wcwidth (wc1); + } + + return 1; +} +#endif /* Find the consecutive changes at the start of the script START. Return the last link before the first gap. */
