On Sun, Aug 04, 2013 at 01:33:36PM +0200, John Darrington wrote: > I'm confused by this line in encoding-guesser.c > (encoding_guess_parse_encoding) > > return is_encoding_utf8 (fallback) ? "windows-1252" : fallback; > > If we have determined that the fallback encoding is UTF8, why return > windows-1252 ? > > Is this a typo ?
It is a bug, but there was more to it than a typo. That logic actually made sense in a couple of places (where we already had figured out that the data could not be encoded in UTF-8). It did not make sense in other places. I applied the following commit that fixes the problem. Thanks for finding this! --8<--------------------------cut here-------------------------->8-- From: Ben Pfaff <b...@cs.stanford.edu> Date: Sun, 4 Aug 2013 19:42:29 -0700 Subject: [PATCH] encoding-guesser: Fix bug in parsing fallback encodings. The encoding guesser originally used UTF-8 as a fallback in situations where it had already figured out that UTF-8 couldn't be right. Commit d6c75296 (encoding-guesser: Fall back to windows-1252 when UTF-8 can't be right.) fixed this, by falling back to window-1252 when UTF-8 was known to be wrong, but it also introduced a bug where this change to the fallback encoding occurred even when UTF-8 had not be determined to be wrong. This commit fixes that problem. Reported by John Darrington. --- src/libpspp/encoding-guesser.c | 45 ++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/src/libpspp/encoding-guesser.c b/src/libpspp/encoding-guesser.c index b55f24a..9518bd6 100644 --- a/src/libpspp/encoding-guesser.c +++ b/src/libpspp/encoding-guesser.c @@ -43,19 +43,15 @@ const char * encoding_guess_parse_encoding (const char *encoding) { - const char *fallback; - if (encoding == NULL || !c_strcasecmp (encoding, "auto") || !c_strcasecmp (encoding, "auto,locale") || !c_strcasecmp (encoding, "locale")) - fallback = locale_charset (); + return locale_charset (); else if (!c_strncasecmp (encoding, "auto,", 5)) - fallback = encoding + 5; + return encoding + 5; else return encoding; - - return is_encoding_utf8 (fallback) ? "windows-1252" : fallback; } /* Returns true if ENCODING, which must be in one of the forms described at the @@ -277,10 +273,25 @@ encoding_guess_head_encoding (const char *encoding, if (is_utf32 (data, n, get_le32)) return "UTF-32LE"; - if (!is_encoding_ascii_compatible (fallback_encoding) - || !encoding_guess_tail_is_utf8 (data, n)) + /* We've tried all the "giveaways" that make the encoding obvious. That + rules out, incidentally, all the encodings with multibyte units + (e.g. UTF-16, UTF-32). Our remaining goal is to try to distinguish UTF-8 + from some ASCII-based fallback encoding. */ + + /* If the fallback encoding isn't ASCII compatible, give up. */ + if (!is_encoding_ascii_compatible (fallback_encoding)) return fallback_encoding; + /* If the data we have clearly is not UTF-8, give up. */ + if (!encoding_guess_tail_is_utf8 (data, n)) + { + /* If the fallback encoding is UTF-8, fall back on something else.*/ + if (is_encoding_utf8 (fallback_encoding)) + return "windows-1252"; + + return fallback_encoding; + } + return "ASCII"; } @@ -333,9 +344,21 @@ const char * encoding_guess_tail_encoding (const char *encoding, const void *data, size_t n) { - return (encoding_guess_tail_is_utf8 (data, n) != 0 - ? "UTF-8" - : encoding_guess_parse_encoding (encoding)); + + if (encoding_guess_tail_is_utf8 (data, n) != 0) + return "UTF-8"; + else + { + /* The data is not UTF-8. */ + const char *fallback_encoding = encoding_guess_parse_encoding (encoding); + + /* If the fallback encoding is UTF-8, fall back on something else.*/ + if (is_encoding_utf8 (fallback_encoding)) + return "windows-1252"; + + return fallback_encoding; + } + } /* Returns an encoding guess based on ENCODING and the N bytes of text starting -- 1.7.10.4 _______________________________________________ pspp-dev mailing list pspp-dev@gnu.org https://lists.gnu.org/mailman/listinfo/pspp-dev