On Mon, 2024-11-11 at 07:27 +0100, Peter Eisentraut wrote: > Here is the patch to update the Unicode data to version 16.0.0. > > Normally, this would have been routine, but a few months ago there > was > some debate about how this should be handled. [0] AFAICT, the > consensus > was to go ahead with it, but I just wanted to notify it here to be > clear.
We discussed $SUBJECT at the Developer Meeting before FOSDEM. Those people who were most concerned about the Unicode updates on the list were not present, so I don't consider the discussion to be binding. But the attendees present agreed that: (a) we should not block the update to Unicode indefinitely; and (b) we should make reasonable attempts to mitigate potential problems. One idea for (b) resurfaced, which was to make a best-effort check at pg_upgrade time for affected indexes. The check would not be bulletproof, because we can't catch dependencies that are hidden inside SPI (e.g. a plpgsql function that calls LOWER()), but it would catch most potential problems. Patch attached. A few notes: * The dependency entries don't exist because LOWER(), etc., are system objects (pinned); so it queries the indexprs, indpreds, partexprs, and conbin. * The query is large and perhaps too clever, but it seems to work. I tried to add inline comments to the SQL, and pgindent had its own ideas about how to format them -- suggestions welcome. * We haven't actually done the Unicode update yet, so it will notice that the PG17 and PG18 Unicode versions are the same, and return early. Either apply on top of the Unicode update patch, or comment out the early return for testing. * It emits a warning rather than an error, so you need to specify pg_upgrade with "-r" to see the output file. * I didn't adapt the query to run on pre-17 versions, even though it could find some potential problem cases (like an index on NORMALIZE()). I can add that if someone thinks it's worthwhile. Regards, Jeff Davis
From 9d7097edff9230bc9ced9757eb7f23b3ac267b0f Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Mon, 3 Feb 2025 13:02:37 -0800 Subject: [PATCH v1] Add pg_upgrade check for Unicode-dependent relations. --- src/bin/pg_upgrade/check.c | 179 +++++++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 7ca1d8fffc9..17ca1066906 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -13,6 +13,7 @@ #include "catalog/pg_class_d.h" #include "fe_utils/string_utils.h" #include "pg_upgrade.h" +#include "common/unicode_version.h" static void check_new_cluster_is_empty(void); static void check_is_install_user(ClusterInfo *cluster); @@ -25,6 +26,7 @@ static void check_for_tables_with_oids(ClusterInfo *cluster); static void check_for_pg_role_prefix(ClusterInfo *cluster); static void check_for_new_tablespace_dir(void); static void check_for_user_defined_encoding_conversions(ClusterInfo *cluster); +static void check_for_unicode_update(ClusterInfo *cluster); static void check_new_cluster_logical_replication_slots(void); static void check_new_cluster_subscription_configuration(void); static void check_old_cluster_for_valid_slots(void); @@ -633,6 +635,12 @@ check_and_dump_old_cluster(void) check_for_data_types_usage(&old_cluster); + /* + * Unicode updates can affect some objects that use expressions with + * functions dependent on Unicode. + */ + check_for_unicode_update(&old_cluster); + /* * PG 14 changed the function signature of encoding conversion functions. * Conversions from older versions cannot be upgraded automatically @@ -1740,6 +1748,177 @@ check_for_user_defined_encoding_conversions(ClusterInfo *cluster) check_ok(); } +/* + * Callback function for processing results of query for + * check_for_unicode_update()'s UpgradeTask. If the query returned any rows + * (i.e., the check failed), write the details to the report file. + */ +static void +process_unicode_update(DbInfo *dbinfo, PGresult *res, void *arg) +{ + UpgradeTaskReport *report = (UpgradeTaskReport *) arg; + int ntups = PQntuples(res); + int i_reloid = PQfnumber(res, "reloid"); + int i_nspname = PQfnumber(res, "nspname"); + int i_relname = PQfnumber(res, "relname"); + + if (ntups == 0) + return; + + if (report->file == NULL && + (report->file = fopen_priv(report->path, "w")) == NULL) + pg_fatal("could not open file \"%s\": %m", report->path); + + fprintf(report->file, "In database: %s\n", dbinfo->db_name); + + for (int rowno = 0; rowno < ntups; rowno++) + fprintf(report->file, " (oid=%s) %s.%s\n", + PQgetvalue(res, rowno, i_reloid), + PQgetvalue(res, rowno, i_nspname), + PQgetvalue(res, rowno, i_relname)); +} + +/* + * Check if the Unicode version built into Postgres changed between the old + * cluster and the new cluster. + */ +static bool +unicode_version_changed(ClusterInfo *cluster) +{ + PGconn *conn_template1 = connectToServer(cluster, "template1"); + PGresult *res; + char *old_unicode_version; + bool unicode_updated; + + res = executeQueryOrDie(conn_template1, "SELECT unicode_version()"); + old_unicode_version = PQgetvalue(res, 0, 0); + unicode_updated = (strcmp(old_unicode_version, PG_UNICODE_VERSION) != 0); + + PQclear(res); + PQfinish(conn_template1); + + return unicode_updated; +} + +/* + * check_for_unicode_update() + * + * Check if the version of Unicode in the old server and the new server + * differ. If so, check for indexes, partitioned tables, or constraints that + * use expressions with functions dependent on Unicode behavior. + */ +static void +check_for_unicode_update(ClusterInfo *cluster) +{ + UpgradeTaskReport report; + UpgradeTask *task = upgrade_task_create(); + const char *query; + + /* + * The builtin provider did not exist prior to version 17. While there are + * still problems that could potentially be caught from earlier versions, + * such as an index on NORMALIZE(), we don't check for that here. + */ + if (GET_MAJOR_VERSION(cluster->major_version) < 1700) + return; + + prep_status("Checking for objects affected by Unicode update"); + + if (!unicode_version_changed(cluster)) + { + check_ok(); + return; + } + + report.file = NULL; + snprintf(report.path, sizeof(report.path), "%s/%s", + log_opts.basedir, + "unicode_dependent_rels.txt"); + + query = + /* collations that use built-in Unicode for character semantics */ + "WITH collations(collid) AS ( " + " SELECT oid FROM pg_collation " + " WHERE collprovider='b' AND colllocale IN ('C.UTF-8','PG_UNICODE_FAST') " + /* include default collation, if appropriate */ + " UNION " + " SELECT 'pg_catalog.default'::regcollation FROM pg_database " + " WHERE datname = current_database() AND " + " datlocprovider='b' AND datlocale IN ('C.UTF-8','PG_UNICODE_FAST') " + "), " + /* functions that use built-in Unicode */ + "functions(procid) AS ( " + " SELECT proc.oid FROM pg_proc proc " + " WHERE proname IN ('normalize','unicode_assigned','unicode_version','is_normalized') AND " + " pronamespace='pg_catalog'::regnamespace " + "), " + /* operators that use the input collation for character semantics */ + "coll_operators(operid, procid, collid) AS ( " + " SELECT oper.oid, oper.oprcode, collid FROM pg_operator oper, collations " + " WHERE oprname IN ('~', '~*', '!~', '!~*', '~~*', '!~~*') AND " + " oprnamespace='pg_catalog'::regnamespace AND " + " oprright='text'::regtype " + "), " + /* functions that use the input collation for character semantics */ + "coll_functions(procid, collid) AS ( " + " SELECT proc.oid, collid FROM pg_proc proc, collations " + " WHERE proname IN ('lower','initcap','upper') AND " + " pronamespace='pg_catalog'::regnamespace AND " + " proargtypes[0] = 'text'::regtype " + /* include functions behind the operators listed above */ + " UNION " + " SELECT procid, collid FROM coll_operators " + "), " + + /* + * Generate patterns to search a pg_node_tree for the above functions and + * operators. + */ + "patterns(p) AS ( " + " SELECT '{FUNCEXPR :funcid ' || procid::text || '[ }]' FROM functions " + " UNION " + " SELECT '{OPEXPR :opno ' || operid::text || ' (:\\w+ \\w+ )*' || " + " ':inputcollid ' || collid::text || '[ }]' FROM coll_operators " + " UNION " + " SELECT '{FUNCEXPR :funcid ' || procid::text || ' (:\\w+ \\w+ )*' || " + " ':inputcollid ' || collid::text || '[ }]' FROM coll_functions " + ") " + + /* + * Match the patterns against expressions used for relation contents. + */ + "SELECT reloid, relkind, nspname, relname " + " FROM ( " + " SELECT conrelid " + " FROM pg_constraint, patterns WHERE conbin::text ~ p " + " UNION " + " SELECT indexrelid " + " FROM pg_index, patterns WHERE indexprs::text ~ p OR indpred::text ~ p " + " UNION " + " SELECT partrelid " + " FROM pg_partitioned_table, patterns WHERE partexprs::text ~ p " + " ) s(reloid), pg_class c, pg_namespace n, pg_database d " + " WHERE s.reloid = c.oid AND c.relnamespace = n.oid AND " + " d.datname = current_database() AND " + " d.encoding = pg_char_to_encoding('UTF8');"; + + upgrade_task_add_step(task, query, + process_unicode_update, + true, &report); + upgrade_task_run(task, cluster); + upgrade_task_free(task); + + if (report.file) + { + fclose(report.file); + pg_log(PG_WARNING, "Your installation contains relations that may be affected by a new version of Unicode.\n" + "A list of potentially-affected relations is in the file:\n" + " %s", report.path); + } + else + check_ok(); +} + /* * check_new_cluster_logical_replication_slots() * -- 2.34.1