Re: Update Unicode data to Unicode 16.0.0

Jeff Davis Wed, 05 Feb 2025 13:47:37 -0800

On Mon, 2024-11-11 at 07:27 +0100, Peter Eisentraut wrote:
> Here is the patch to update the Unicode data to version 16.0.0.
> 
> Normally, this would have been routine, but a few months ago there
> was 
> some debate about how this should be handled. [0]  AFAICT, the
> consensus 
> was to go ahead with it, but I just wanted to notify it here to be
> clear.


We discussed $SUBJECT at the Developer Meeting before FOSDEM.

Those people who were most concerned about the Unicode updates on the
list were not present, so I don't consider the discussion to be
binding. But the attendees present agreed that:

  (a) we should not block the update to Unicode indefinitely; and
  (b) we should make reasonable attempts to mitigate potential
problems.

One idea for (b) resurfaced, which was to make a best-effort check at
pg_upgrade time for affected indexes. The check would not be
bulletproof, because we can't catch dependencies that are hidden inside
SPI (e.g. a plpgsql function that calls LOWER()), but it would catch
most potential problems.

Patch attached. A few notes:

  * The dependency entries don't exist because LOWER(), etc., are
system objects (pinned); so it queries the indexprs, indpreds,
partexprs, and conbin.
  * The query is large and perhaps too clever, but it seems to work. I
tried to add inline comments to the SQL, and pgindent had its own ideas
about how to format them -- suggestions welcome.
  * We haven't actually done the Unicode update yet, so it will notice
that the PG17 and PG18 Unicode versions are the same, and return early.
Either apply on top of the Unicode update patch, or comment out the
early return for testing.
  * It emits a warning rather than an error, so you need to specify
pg_upgrade with "-r" to see the output file.
  * I didn't adapt the query to run on pre-17 versions, even though it
could find some potential problem cases (like an index on NORMALIZE()).
I can add that if someone thinks it's worthwhile.

Regards,
        Jeff Davis

From 9d7097edff9230bc9ced9757eb7f23b3ac267b0f Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 3 Feb 2025 13:02:37 -0800
Subject: [PATCH v1] Add pg_upgrade check for Unicode-dependent relations.

---
 src/bin/pg_upgrade/check.c | 179 +++++++++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)

diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c
index 7ca1d8fffc9..17ca1066906 100644
--- a/src/bin/pg_upgrade/check.c
+++ b/src/bin/pg_upgrade/check.c
@@ -13,6 +13,7 @@
 #include "catalog/pg_class_d.h"
 #include "fe_utils/string_utils.h"
 #include "pg_upgrade.h"
+#include "common/unicode_version.h"
 
 static void check_new_cluster_is_empty(void);
 static void check_is_install_user(ClusterInfo *cluster);
@@ -25,6 +26,7 @@ static void check_for_tables_with_oids(ClusterInfo *cluster);
 static void check_for_pg_role_prefix(ClusterInfo *cluster);
 static void check_for_new_tablespace_dir(void);
 static void check_for_user_defined_encoding_conversions(ClusterInfo *cluster);
+static void check_for_unicode_update(ClusterInfo *cluster);
 static void check_new_cluster_logical_replication_slots(void);
 static void check_new_cluster_subscription_configuration(void);
 static void check_old_cluster_for_valid_slots(void);
@@ -633,6 +635,12 @@ check_and_dump_old_cluster(void)
 
 	check_for_data_types_usage(&old_cluster);
 
+	/*
+	 * Unicode updates can affect some objects that use expressions with
+	 * functions dependent on Unicode.
+	 */
+	check_for_unicode_update(&old_cluster);
+
 	/*
 	 * PG 14 changed the function signature of encoding conversion functions.
 	 * Conversions from older versions cannot be upgraded automatically
@@ -1740,6 +1748,177 @@ check_for_user_defined_encoding_conversions(ClusterInfo *cluster)
 		check_ok();
 }
 
+/*
+ * Callback function for processing results of query for
+ * check_for_unicode_update()'s UpgradeTask.  If the query returned any rows
+ * (i.e., the check failed), write the details to the report file.
+ */
+static void
+process_unicode_update(DbInfo *dbinfo, PGresult *res, void *arg)
+{
+	UpgradeTaskReport *report = (UpgradeTaskReport *) arg;
+	int			ntups = PQntuples(res);
+	int			i_reloid = PQfnumber(res, "reloid");
+	int			i_nspname = PQfnumber(res, "nspname");
+	int			i_relname = PQfnumber(res, "relname");
+
+	if (ntups == 0)
+		return;
+
+	if (report->file == NULL &&
+		(report->file = fopen_priv(report->path, "w")) == NULL)
+		pg_fatal("could not open file \"%s\": %m", report->path);
+
+	fprintf(report->file, "In database: %s\n", dbinfo->db_name);
+
+	for (int rowno = 0; rowno < ntups; rowno++)
+		fprintf(report->file, "  (oid=%s) %s.%s\n",
+				PQgetvalue(res, rowno, i_reloid),
+				PQgetvalue(res, rowno, i_nspname),
+				PQgetvalue(res, rowno, i_relname));
+}
+
+/*
+ * Check if the Unicode version built into Postgres changed between the old
+ * cluster and the new cluster.
+ */
+static bool
+unicode_version_changed(ClusterInfo *cluster)
+{
+	PGconn	   *conn_template1 = connectToServer(cluster, "template1");
+	PGresult   *res;
+	char	   *old_unicode_version;
+	bool		unicode_updated;
+
+	res = executeQueryOrDie(conn_template1, "SELECT unicode_version()");
+	old_unicode_version = PQgetvalue(res, 0, 0);
+	unicode_updated = (strcmp(old_unicode_version, PG_UNICODE_VERSION) != 0);
+
+	PQclear(res);
+	PQfinish(conn_template1);
+
+	return unicode_updated;
+}
+
+/*
+ * check_for_unicode_update()
+ *
+ * Check if the version of Unicode in the old server and the new server
+ * differ. If so, check for indexes, partitioned tables, or constraints that
+ * use expressions with functions dependent on Unicode behavior.
+ */
+static void
+check_for_unicode_update(ClusterInfo *cluster)
+{
+	UpgradeTaskReport report;
+	UpgradeTask *task = upgrade_task_create();
+	const char *query;
+
+	/*
+	 * The builtin provider did not exist prior to version 17. While there are
+	 * still problems that could potentially be caught from earlier versions,
+	 * such as an index on NORMALIZE(), we don't check for that here.
+	 */
+	if (GET_MAJOR_VERSION(cluster->major_version) < 1700)
+		return;
+
+	prep_status("Checking for objects affected by Unicode update");
+
+	if (!unicode_version_changed(cluster))
+	{
+		check_ok();
+		return;
+	}
+
+	report.file = NULL;
+	snprintf(report.path, sizeof(report.path), "%s/%s",
+			 log_opts.basedir,
+			 "unicode_dependent_rels.txt");
+
+	query =
+	/* collations that use built-in Unicode for character semantics */
+		"WITH collations(collid) AS ( "
+		"  SELECT oid FROM pg_collation "
+		"  WHERE collprovider='b' AND colllocale IN ('C.UTF-8','PG_UNICODE_FAST') "
+	/* include default collation, if appropriate */
+		"  UNION "
+		"  SELECT 'pg_catalog.default'::regcollation FROM pg_database "
+		"  WHERE datname = current_database() AND "
+		"  datlocprovider='b' AND datlocale IN ('C.UTF-8','PG_UNICODE_FAST') "
+		"), "
+	/* functions that use built-in Unicode */
+		"functions(procid) AS ( "
+		"  SELECT proc.oid FROM pg_proc proc "
+		"  WHERE proname IN ('normalize','unicode_assigned','unicode_version','is_normalized') AND "
+		"        pronamespace='pg_catalog'::regnamespace "
+		"), "
+	/* operators that use the input collation for character semantics */
+		"coll_operators(operid, procid, collid) AS ( "
+		"  SELECT oper.oid, oper.oprcode, collid FROM pg_operator oper, collations "
+		"  WHERE oprname IN ('~', '~*', '!~', '!~*', '~~*', '!~~*') AND "
+		"        oprnamespace='pg_catalog'::regnamespace AND "
+		"        oprright='text'::regtype "
+		"), "
+	/* functions that use the input collation for character semantics */
+		"coll_functions(procid, collid) AS ( "
+		"  SELECT proc.oid, collid FROM pg_proc proc, collations "
+		"  WHERE proname IN ('lower','initcap','upper') AND "
+		"        pronamespace='pg_catalog'::regnamespace AND "
+		"        proargtypes[0] = 'text'::regtype "
+	/* include functions behind the operators listed above */
+		"  UNION "
+		"  SELECT procid, collid FROM coll_operators "
+		"), "
+
+	/*
+	 * Generate patterns to search a pg_node_tree for the above functions and
+	 * operators.
+	 */
+		"patterns(p) AS ( "
+		"  SELECT '{FUNCEXPR :funcid ' || procid::text || '[ }]' FROM functions "
+		"  UNION "
+		"  SELECT '{OPEXPR :opno ' || operid::text || ' (:\\w+ \\w+ )*' || "
+		"         ':inputcollid ' || collid::text || '[ }]' FROM coll_operators "
+		"  UNION "
+		"  SELECT '{FUNCEXPR :funcid ' || procid::text || ' (:\\w+ \\w+ )*' || "
+		"         ':inputcollid ' || collid::text || '[ }]' FROM coll_functions "
+		") "
+
+	/*
+	 * Match the patterns against expressions used for relation contents.
+	 */
+		"SELECT reloid, relkind, nspname, relname "
+		"  FROM ( "
+		"    SELECT conrelid "
+		"    FROM pg_constraint, patterns WHERE conbin::text ~ p "
+		"  UNION "
+		"    SELECT indexrelid "
+		"    FROM pg_index, patterns WHERE indexprs::text ~ p OR indpred::text ~ p "
+		"  UNION "
+		"    SELECT partrelid "
+		"    FROM pg_partitioned_table, patterns WHERE partexprs::text ~ p "
+		"  ) s(reloid), pg_class c, pg_namespace n, pg_database d "
+		"  WHERE s.reloid = c.oid AND c.relnamespace = n.oid AND "
+		"        d.datname = current_database() AND "
+		"        d.encoding = pg_char_to_encoding('UTF8');";
+
+	upgrade_task_add_step(task, query,
+						  process_unicode_update,
+						  true, &report);
+	upgrade_task_run(task, cluster);
+	upgrade_task_free(task);
+
+	if (report.file)
+	{
+		fclose(report.file);
+		pg_log(PG_WARNING, "Your installation contains relations that may be affected by a new version of Unicode.\n"
+			   "A list of potentially-affected relations is in the file:\n"
+			   "    %s", report.path);
+	}
+	else
+		check_ok();
+}
+
 /*
  * check_new_cluster_logical_replication_slots()
  *
-- 
2.34.1

Re: Update Unicode data to Unicode 16.0.0

Reply via email to