On Tue, Jan 28, 2020 at 9:35 PM Peter Eisentraut <
[email protected]> wrote:
> On 2020-01-28 16:47, Juan José Santamaría Flecha wrote:
> > This patch targets to do something symmetrical to to_char(), which will
> > just return a single value.
>
> I didn't fully realize while reading this thread that to_char() already
> supports localized output and this patch indeed just wants to do the
> opposite.
>
> So I'm withdrawing my concerns with respect to this patch. As long as
> it can do a roundtrip conversion with to_char(), it's fine.
>
>
We can avoid issues with languages whose case conversions are not injective
by applying a double conversion, so that both strings in the comparison end
up in the same state.
I propose an upper/lower conversion, as in the attached patch.
Regards,
Juan José Santamaría Flecha
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index ceda48e..b1951e5 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -5968,7 +5968,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
</row>
<row>
<entry><literal>TM</literal> prefix</entry>
- <entry>translation mode (print localized day and month names based on
+ <entry>translation mode (use localized day and month names based on
<xref linkend="guc-lc-time"/>)</entry>
<entry><literal>TMMonth</literal></entry>
</row>
@@ -6000,8 +6000,13 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
<listitem>
<para>
<literal>TM</literal> does not include trailing blanks.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
<function>to_timestamp</function> and <function>to_date</function> ignore
- the <literal>TM</literal> modifier.
+ case when receiving names as input.
</para>
</listitem>
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index f58331d..e5b4eb5 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1059,9 +1059,11 @@ static int from_char_parse_int_len(int *dest, const char **src, const int len,
FormatNode *node, bool *have_error);
static int from_char_parse_int(int *dest, const char **src, FormatNode *node,
bool *have_error);
-static int seq_search(const char *name, const char *const *array, int *len);
+static int seq_search_ascii(const char *name, const char *const *array, int *len);
+static int seq_search_localized(const char *name, char **array, int *len);
static int from_char_seq_search(int *dest, const char **src,
const char *const *array,
+ char **localized_array,
FormatNode *node, bool *have_error);
static void do_to_timestamp(text *date_txt, text *fmt, bool std,
struct pg_tm *tm, fsec_t *fsec, int *fprec,
@@ -2459,7 +2461,7 @@ from_char_parse_int(int *dest, const char **src, FormatNode *node, bool *have_er
* suitable for comparisons to ASCII strings.
*/
static int
-seq_search(const char *name, const char *const *array, int *len)
+seq_search_ascii(const char *name, const char *const *array, int *len)
{
unsigned char firstc;
const char *const *a;
@@ -2505,8 +2507,74 @@ seq_search(const char *name, const char *const *array, int *len)
}
/*
- * Perform a sequential search in 'array' for an entry matching the first
- * character(s) of the 'src' string case-insensitively.
+ * Sequentially search an array of possibly non-English words for
+ * a case-insensitive match to the initial character(s) of "name".
+ *
+ * This has the same API as seq_search_ascii(), but we use a more general
+ * downcasing transformation to achieve case-insensitivity.
+ *
+ * The array is treated as const, but we don't declare it that way because
+ * the arrays exported by pg_locale.c aren't const.
+ */
+static int
+seq_search_localized(const char *name, char **array, int *len)
+{
+ char **a;
+ char *lower_name;
+ char *upper_name;
+
+ *len = 0;
+
+ /* empty string can't match anything */
+ if (!*name)
+ return -1;
+
+ /*
+ * We do an upper/lower conversion to avoid problems with languages
+ * in which case conversions are not injective.
+ */
+ upper_name = str_toupper(unconstify(char *, name), strlen(name),
+ DEFAULT_COLLATION_OID);
+ lower_name = str_tolower(upper_name, strlen(upper_name),
+ DEFAULT_COLLATION_OID);
+ pfree(upper_name);
+
+ for (a = array; *a != NULL; a++)
+ {
+ char *lower_element;
+ char *upper_element;
+ int element_len;
+
+ /* Upper/lower-case array element, assuming it is normalized */
+ upper_element = str_toupper(*a, strlen(*a), DEFAULT_COLLATION_OID);
+ lower_element = str_tolower(upper_element, strlen(upper_element),
+ DEFAULT_COLLATION_OID);
+ pfree(upper_element);
+ element_len = strlen(lower_element);
+
+ /* Match? */
+ if (strncmp(lower_name, lower_element, element_len) == 0)
+ {
+ *len = element_len;
+ pfree(lower_element);
+ pfree(lower_name);
+ return a - array;
+ }
+ pfree(lower_element);
+ }
+
+ pfree(lower_name);
+ return -1;
+}
+
+/*
+ * Perform a sequential search in 'array' (or 'localized_array', if that's
+ * not NULL) for an entry matching the first character(s) of the 'src'
+ * string case-insensitively.
+ *
+ * The 'array' is presumed to be English words (all-ASCII), but
+ * if 'localized_array' is supplied, that might be non-English
+ * so we need a more expensive downcasing transformation.
*
* If a match is found, copy the array index of the match into the integer
* pointed to by 'dest', advance 'src' to the end of the part of the string
@@ -2520,11 +2588,15 @@ seq_search(const char *name, const char *const *array, int *len)
*/
static int
from_char_seq_search(int *dest, const char **src, const char *const *array,
+ char **localized_array,
FormatNode *node, bool *have_error)
{
int len;
- *dest = seq_search(*src, array, &len);
+ if (localized_array == NULL)
+ *dest = seq_search_ascii(*src, array, &len);
+ else
+ *dest = seq_search_localized(*src, localized_array, &len);
if (len <= 0)
{
@@ -3172,6 +3244,9 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
/* number of extra skipped characters (more than given in format string) */
int extra_skip = 0;
+ /* cache localized days and months */
+ cache_locale_time();
+
for (n = node, s = in; n->type != NODE_TYPE_END && *s != '\0'; n++)
{
/*
@@ -3272,7 +3347,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_P_M:
case DCH_a_m:
case DCH_p_m:
- from_char_seq_search(&value, &s, ampm_strings_long,
+ from_char_seq_search(&value, &s, ampm_strings_long, NULL,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->pm, value % 2, n, have_error);
@@ -3283,7 +3358,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_PM:
case DCH_am:
case DCH_pm:
- from_char_seq_search(&value, &s, ampm_strings,
+ from_char_seq_search(&value, &s, ampm_strings, NULL,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->pm, value % 2, n, have_error);
@@ -3396,7 +3471,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_B_C:
case DCH_a_d:
case DCH_b_c:
- from_char_seq_search(&value, &s, adbc_strings_long,
+ from_char_seq_search(&value, &s, adbc_strings_long, NULL,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->bc, value % 2, n, have_error);
@@ -3406,7 +3481,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_BC:
case DCH_ad:
case DCH_bc:
- from_char_seq_search(&value, &s, adbc_strings,
+ from_char_seq_search(&value, &s, adbc_strings, NULL,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->bc, value % 2, n, have_error);
@@ -3416,6 +3491,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_Month:
case DCH_month:
from_char_seq_search(&value, &s, months_full,
+ S_TM(n->suffix) ? localized_full_months : NULL,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->mm, value + 1, n, have_error);
@@ -3425,6 +3501,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_Mon:
case DCH_mon:
from_char_seq_search(&value, &s, months,
+ S_TM(n->suffix) ? localized_abbrev_months : NULL,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->mm, value + 1, n, have_error);
@@ -3439,6 +3516,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_Day:
case DCH_day:
from_char_seq_search(&value, &s, days,
+ S_TM(n->suffix) ? localized_full_days : NULL,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->d, value, n, have_error);
@@ -3449,6 +3527,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
case DCH_Dy:
case DCH_dy:
from_char_seq_search(&value, &s, days_short,
+ S_TM(n->suffix) ? localized_abbrev_days : NULL,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->d, value, n, have_error);
@@ -3566,7 +3645,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
break;
case DCH_RM:
case DCH_rm:
- from_char_seq_search(&value, &s, rm_months_lower,
+ from_char_seq_search(&value, &s, rm_months_lower, NULL,
n, have_error);
CHECK_ERROR;
from_char_set_int(&out->mm, MONTHS_PER_YEAR - value,
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 25fb7e2..64fd3ae 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -96,11 +96,17 @@ char *locale_monetary;
char *locale_numeric;
char *locale_time;
-/* lc_time localization cache */
-char *localized_abbrev_days[7];
-char *localized_full_days[7];
-char *localized_abbrev_months[12];
-char *localized_full_months[12];
+/*
+ * lc_time localization cache.
+ *
+ * We use only the first 7 or 12 entries of these arrays. The last array
+ * element is left as NULL for the convenience of outside code that wants
+ * to sequentially scan these arrays.
+ */
+char *localized_abbrev_days[7 + 1];
+char *localized_full_days[7 + 1];
+char *localized_abbrev_months[12 + 1];
+char *localized_full_months[12 + 1];
/* indicates whether locale information cache is valid */
static bool CurrentLocaleConvValid = false;
@@ -922,6 +928,8 @@ cache_locale_time(void)
cache_single_string(&localized_full_days[i], bufptr, encoding);
bufptr += MAX_L10N_DATA;
}
+ localized_abbrev_days[7] = NULL;
+ localized_full_days[7] = NULL;
/* localized months */
for (i = 0; i < 12; i++)
@@ -931,6 +939,8 @@ cache_locale_time(void)
cache_single_string(&localized_full_months[i], bufptr, encoding);
bufptr += MAX_L10N_DATA;
}
+ localized_abbrev_months[12] = NULL;
+ localized_full_months[12] = NULL;
CurrentLCTimeValid = true;
}
diff --git a/src/test/regress/expected/collate.linux.utf8.out b/src/test/regress/expected/collate.linux.utf8.out
index 37c6add..68b8f14 100644
--- a/src/test/regress/expected/collate.linux.utf8.out
+++ b/src/test/regress/expected/collate.linux.utf8.out
@@ -461,6 +461,16 @@ SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR");
01 NİS 2010
(1 row)
+-- to_date
+SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY');
+ to_date
+------------
+ 02-01-2010
+(1 row)
+
+SELECT to_date('1234567890ab 2010', 'TMMONTH YYYY'); -- fail
+ERROR: invalid value "1234567890ab" for "MONTH"
+DETAIL: The given value did not match any of the allowed values for this field.
-- backwards parsing
CREATE VIEW collview1 AS SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc';
CREATE VIEW collview2 AS SELECT a, b FROM collate_test1 ORDER BY b COLLATE "C";
diff --git a/src/test/regress/sql/collate.linux.utf8.sql b/src/test/regress/sql/collate.linux.utf8.sql
index 8c26f16..3d9a2ed 100644
--- a/src/test/regress/sql/collate.linux.utf8.sql
+++ b/src/test/regress/sql/collate.linux.utf8.sql
@@ -182,6 +182,11 @@ SELECT to_char(date '2010-02-01', 'DD TMMON YYYY' COLLATE "tr_TR");
SELECT to_char(date '2010-04-01', 'DD TMMON YYYY');
SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR");
+-- to_date
+
+SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY');
+SELECT to_date('1234567890ab 2010', 'TMMONTH YYYY'); -- fail
+
-- backwards parsing