I wrote:
>> I wonder whether we need to relax the matching code to be entirely
>> agnostic about spaces and punctuation in the Windows locale name.

> After googling a little bit, I could not find any indication that
> Microsoft promises anything at all about the stability of these
> long-form locale names.  They document short names similar to the
> Unix conventions, e.g. "en-US" and "nb-NO", as being the stable
> forms that applications are encouraged to use.  So somewhere there
> is code that converts these long-form names to the standardized
> representation, and it would be entirely reasonable for that code
> to try to be forgiving.  Thus, it's no surprise that we're getting
> bitten by small variations like these.

> I'm inclined to think that we ought to ignore anything that isn't
> an ASCII letter while trying to match these locale names.  That's
> a little bit problematic in terms of what win32setlocale.c does
> today, because it tries to replace "just the matched string",
> but it'd be unclear where the match ends if there are ignorable
> characters.  But probably we could change it so that it just takes
> the translation and then tacks on ".NNNN" if the input ends with
> a dot and digits.

> Maybe case insensitivity would be a good idea too?  The existing
> code hasn't got that refinement, so maybe it's not important,
> but the examples I'm seeing in places like
> https://docs.microsoft.com/en-us/cpp/c-runtime-library/language-strings?view=vs-2019
> are all-lower-case.

Here's a draft patch for that.  I've checked that the logic does
what I expect, but I don't have a way to actually test this thing
in a Windows build.  Anyone?

                        regards, tom lane

diff --git a/src/port/win32setlocale.c b/src/port/win32setlocale.c
index dfa3140..c549c1b 100644
--- a/src/port/win32setlocale.c
+++ b/src/port/win32setlocale.c
@@ -29,6 +29,12 @@
  * in the pg_database system catalog. To work around that, when setlocale()
  * returns that locale name, map it to a pure-ASCII alias for the same
  * locale.
+ *
+ * These workarounds are complicated by the fact that these long-form locale
+ * names aren't particularly static across Windows versions; punctuation and
+ * spacing, for example, can vary.  To ensure we match when we should match,
+ * ignore everything but ASCII letters in the locale name.  (This also eases
+ * recognizing Bokmål.)
  *-------------------------------------------------------------------------
  */
 
@@ -39,16 +45,18 @@
 struct locale_map
 {
 	/*
-	 * String in locale name to replace. Can be a single string (end is NULL),
-	 * or separate start and end strings. If two strings are given, the locale
-	 * name must contain both of them, and everything between them is
-	 * replaced. This is used for a poor-man's regexp search, allowing
-	 * replacement of "start.*end".
+	 * String in locale name to replace.  While matching, we consider only
+	 * plain ASCII letters, and the match is case-insensitive.
 	 */
-	const char *locale_name_start;
-	const char *locale_name_end;
+	const char *locale_name;	/* locale name to search for */
 
 	const char *replacement;	/* string to replace the match with */
+
+	/*
+	 * If this is true, copy any code page specification (trailing .NNNN) from
+	 * the source locale name.
+	 */
+	bool		copy_code_page;
 };
 
 /*
@@ -57,14 +65,14 @@ struct locale_map
 static const struct locale_map locale_map_argument[] = {
 	/*
 	 * "HKG" is listed here:
-	 * http://msdn.microsoft.com/en-us/library/cdax410z%28v=vs.71%29.aspx
+	 * https://docs.microsoft.com/en-us/cpp/c-runtime-library/country-region-strings
 	 * (Country/Region Strings).
 	 *
 	 * "ARE" is the ISO-3166 three-letter code for U.A.E. It is not on the
 	 * above list, but seems to work anyway.
 	 */
-	{"Hong Kong S.A.R.", NULL, "HKG"},
-	{"U.A.E.", NULL, "ARE"},
+	{"Hong Kong S.A.R.", "HKG", true},
+	{"U.A.E.", "ARE", true},
 
 	/*
 	 * The ISO-3166 country code for Macau S.A.R. is MAC, but Windows doesn't
@@ -75,15 +83,15 @@ static const struct locale_map locale_map_argument[] = {
 	 * works.
 	 *
 	 * Note that unlike HKG and ARE, ZHM is an alias for the *whole* locale
-	 * name, not just the country part.
+	 * name, not just the country part, so we suppress any code page spec.
 	 *
 	 * Some versions of Windows spell it "Macau", others "Macao".
 	 */
-	{"Chinese (Traditional)_Macau S.A.R..950", NULL, "ZHM"},
-	{"Chinese_Macau S.A.R..950", NULL, "ZHM"},
-	{"Chinese (Traditional)_Macao S.A.R..950", NULL, "ZHM"},
-	{"Chinese_Macao S.A.R..950", NULL, "ZHM"},
-	{NULL, NULL, NULL}
+	{"Chinese (Traditional)_Macau S.A.R..950", "ZHM", false},
+	{"Chinese_Macau S.A.R..950", "ZHM", false},
+	{"Chinese (Traditional)_Macao S.A.R..950", "ZHM", false},
+	{"Chinese_Macao S.A.R..950", "ZHM", false},
+	{NULL, NULL, false}
 };
 
 /*
@@ -95,14 +103,11 @@ static const struct locale_map locale_map_result[] = {
 	 * Map it to a pure-ASCII alias.
 	 *
 	 * It's not clear what encoding setlocale() uses when it returns the
-	 * locale name, so to play it safe, we search for "Norwegian (Bok*l)".
-	 *
-	 * Just to make life even more complicated, some versions of Windows spell
-	 * the locale name without parentheses.  Translate that too.
+	 * locale name, but since the search will ignore non-ASCII characters, we
+	 * can just leave å out of the match string.
 	 */
-	{"Norwegian (Bokm", "l)_Norway", "Norwegian_Norway"},
-	{"Norwegian Bokm", "l_Norway", "Norwegian_Norway"},
-	{NULL, NULL, NULL}
+	{"Norwegian Bokml Norway", "Norwegian_Norway", true},
+	{NULL, NULL, false}
 };
 
 #define MAX_LOCALE_NAME_LEN		100
@@ -114,54 +119,73 @@ map_locale(const struct locale_map *map, const char *locale)
 	int			i;
 
 	/* Check if the locale name matches any of the problematic ones. */
-	for (i = 0; map[i].locale_name_start != NULL; i++)
+	for (i = 0; map[i].locale_name != NULL; i++)
 	{
-		const char *needle_start = map[i].locale_name_start;
-		const char *needle_end = map[i].locale_name_end;
+		const char *needle = map[i].locale_name;
 		const char *replacement = map[i].replacement;
-		char	   *match;
-		char	   *match_start = NULL;
-		char	   *match_end = NULL;
-
-		match = strstr(locale, needle_start);
-		if (match)
+		bool		match = true;
+		const char *p1,
+				   *p2;
+		const char *codepage;
+		int			replacementlen;
+		int			cplen;
+
+#define ASCII_LETTERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+
+		p1 = locale;
+		p2 = needle;
+		for (;;)
 		{
-			/*
-			 * Found a match for the first part. If this was a two-part
-			 * replacement, find the second part.
-			 */
-			match_start = match;
-			if (needle_end)
+			/* Ignore characters that aren't ASCII letters */
+			while (*p1 && strchr(ASCII_LETTERS, (unsigned char) *p1) == NULL)
+				p1++;
+
+			while (*p2 && strchr(ASCII_LETTERS, (unsigned char) *p2) == NULL)
+				p2++;
+
+			/* Must match case-insensitively */
+			if (*p1 && *p2)
 			{
-				match = strstr(match_start + strlen(needle_start), needle_end);
-				if (match)
-					match_end = match + strlen(needle_end);
-				else
-					match_start = NULL;
+				if (pg_toupper(*p1) != pg_toupper(*p2))
+				{
+					match = false;
+					break;
+				}
+				p1++, p2++;
 			}
 			else
-				match_end = match_start + strlen(needle_start);
+			{
+				if (*p1 || *p2)
+					match = false;	/* one is longer */
+				break;
+			}
 		}
 
-		if (match_start)
+		if (!match)
+			continue;
+
+		/* Found a match.  Should we include the codepage spec, if any? */
+		if (map[i].copy_code_page)
 		{
-			/* Found a match. Replace the matched string. */
-			int			matchpos = match_start - locale;
-			int			replacementlen = strlen(replacement);
-			char	   *rest = match_end;
-			int			restlen = strlen(rest);
-
-			/* check that the result fits in the static buffer */
-			if (matchpos + replacementlen + restlen + 1 > MAX_LOCALE_NAME_LEN)
-				return NULL;
-
-			memcpy(&aliasbuf[0], &locale[0], matchpos);
-			memcpy(&aliasbuf[matchpos], replacement, replacementlen);
-			/* includes null terminator */
-			memcpy(&aliasbuf[matchpos + replacementlen], rest, restlen + 1);
-
-			return aliasbuf;
+			codepage = strrchr(locale, '.');
+			if (!(codepage && codepage[1] &&
+				  strspn(codepage + 1, "0123456789") == strlen(codepage + 1)))
+				codepage = NULL;
 		}
+		else
+			codepage = NULL;
+
+		/* check that the result fits in the static buffer */
+		replacementlen = strlen(replacement);
+		cplen = (codepage ? strlen(codepage) : 0);
+		if (replacementlen + cplen + 1 > MAX_LOCALE_NAME_LEN)
+			break;				/* treat as no-match */
+
+		memcpy(&aliasbuf[0], replacement, replacementlen + 1);
+		if (codepage)
+			memcpy(&aliasbuf[replacementlen], codepage, cplen + 1);
+
+		return aliasbuf;
 	}
 
 	/* no match, just return the original string */

Reply via email to