From dd5eb976fdefc447826b0310a782c2848c3f21a1 Mon Sep 17 00:00:00 2001
From: DoGeon Yoo <ehrjs023@gmail.com>
Date: Thu, 14 May 2026 15:44:19 +0900
Subject: [PATCH v1 1/2] Add regression test for UHC encoding (baseline
 capture)

UHC is a client-only encoding, so pg_uhc_verifychar() can only be
exercised indirectly through convert_from() in a UTF8 database.
There has been no dedicated regression test for it.

This commit adds src/test/regress/sql/uhc.sql covering:

- valid two-byte sequences at the CP949 lead/trail boundaries
  (trail 0x41, 0x5A, 0x61, 0x7A, 0x81, 0xFE; high leads 0xC7, 0xFD)
- invalid lead bytes (0x80, 0xFF)
- invalid trail bytes (0x40, 0x5B, 0x60, 0x7B, 0x80, 0xFF)
- the NONUTF8_INVALID sentinel pair (0x8d 0x20)
- a truncated two-byte character

The expected output records the *current* behavior on master, not
the desired behavior.  In particular, the eight invalid-lead and
invalid-trail cases (0x80 0x41, 0xFF 0x41, 0x81 0x40, ...) are
currently accepted by pg_uhc_verifychar() and rejected only later,
when the UHC-to-UTF8 conversion finds no mapping for them and fails
with 'character with byte sequence ... in encoding "UHC" has no
equivalent in encoding "UTF8"'.

Capturing this behavior here makes the follow-up patch's diff
self-evident: a subsequent commit that tightens pg_uhc_verifychar()
to enforce the CP949 lead/trail byte ranges will turn those eight
"has no equivalent" errors into "invalid byte sequence" errors,
without changing any other test result.

uhc_1.out is the alternative expected output for non-UTF8 databases,
where the test exits early via \quit.
---
 src/test/regress/expected/uhc.out   | 86 +++++++++++++++++++++++++++++
 src/test/regress/expected/uhc_1.out |  6 ++
 src/test/regress/parallel_schedule  |  2 +-
 src/test/regress/sql/uhc.sql        | 36 ++++++++++++
 4 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 src/test/regress/expected/uhc.out
 create mode 100644 src/test/regress/expected/uhc_1.out
 create mode 100644 src/test/regress/sql/uhc.sql

diff --git a/src/test/regress/expected/uhc.out b/src/test/regress/expected/uhc.out
new file mode 100644
index 00000000000..d922cca7caf
--- /dev/null
+++ b/src/test/regress/expected/uhc.out
@@ -0,0 +1,86 @@
+-- This test is about UHC (Windows-949 / CP949) encoding.  UHC is a
+-- client-only encoding, so exercise pg_uhc_verifychar() via convert_from()
+-- in a UTF8 database.
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- valid: EUC_KR-compatible Hangul (U+AC00 "가")
+SELECT encode(convert_to(convert_from('\xb0a1', 'UHC'), 'UTF8'), 'hex');
+ encode 
+--------
+ eab080
+(1 row)
+
+-- valid: CP949 trail-range endpoints (0x41-0x5A, 0x61-0x7A, 0x81-0xFE) and high leads
+SELECT encode(convert_to(convert_from('\x8141', 'UHC'), 'UTF8'), 'hex');	-- trail 0x41
+ encode 
+--------
+ eab082
+(1 row)
+
+SELECT encode(convert_to(convert_from('\x815a', 'UHC'), 'UTF8'), 'hex');	-- trail 0x5A
+ encode 
+--------
+ eab0b4
+(1 row)
+
+SELECT encode(convert_to(convert_from('\x8161', 'UHC'), 'UTF8'), 'hex');	-- trail 0x61
+ encode 
+--------
+ eab0b5
+(1 row)
+
+SELECT encode(convert_to(convert_from('\x817a', 'UHC'), 'UTF8'), 'hex');	-- trail 0x7A
+ encode 
+--------
+ eab195
+(1 row)
+
+SELECT encode(convert_to(convert_from('\x8181', 'UHC'), 'UTF8'), 'hex');	-- trail 0x81
+ encode 
+--------
+ eab196
+(1 row)
+
+SELECT encode(convert_to(convert_from('\x81fe', 'UHC'), 'UTF8'), 'hex');	-- trail 0xFE
+ encode 
+--------
+ eab493
+(1 row)
+
+SELECT encode(convert_to(convert_from('\xc7a1', 'UHC'), 'UTF8'), 'hex');	-- high lead 0xC7
+ encode 
+--------
+ ed9088
+(1 row)
+
+SELECT encode(convert_to(convert_from('\xfda1', 'UHC'), 'UTF8'), 'hex');	-- high lead 0xFD
+ encode 
+--------
+ e788bb
+(1 row)
+
+-- invalid lead byte (0x80 and 0xFF are unused in CP949)
+SELECT convert_from('\x8041', 'UHC');
+ERROR:  character with byte sequence 0x80 0x41 in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\xff41', 'UHC');
+ERROR:  character with byte sequence 0xff 0x41 in encoding "UHC" has no equivalent in encoding "UTF8"
+-- invalid trail byte (one step outside each trail-range endpoint)
+SELECT convert_from('\x8140', 'UHC');	-- trail 0x40
+ERROR:  character with byte sequence 0x81 0x40 in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\x815b', 'UHC');	-- trail 0x5B
+ERROR:  character with byte sequence 0x81 0x5b in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\x8160', 'UHC');	-- trail 0x60
+ERROR:  character with byte sequence 0x81 0x60 in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\x817b', 'UHC');	-- trail 0x7B
+ERROR:  character with byte sequence 0x81 0x7b in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\x8180', 'UHC');	-- trail 0x80
+ERROR:  character with byte sequence 0x81 0x80 in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\x81ff', 'UHC');	-- trail 0xFF
+ERROR:  character with byte sequence 0x81 0xff in encoding "UHC" has no equivalent in encoding "UTF8"
+SELECT convert_from('\x8d20', 'UHC');	-- separate case: NONUTF8_INVALID sentinel pair
+ERROR:  invalid byte sequence for encoding "UHC": 0x8d 0x20
+-- truncated two-byte character (lead byte with no trail byte)
+SELECT convert_from('\x81', 'UHC');
+ERROR:  invalid byte sequence for encoding "UHC": 0x81
diff --git a/src/test/regress/expected/uhc_1.out b/src/test/regress/expected/uhc_1.out
new file mode 100644
index 00000000000..9deb8b8ee1d
--- /dev/null
+++ b/src/test/regress/expected/uhc_1.out
@@ -0,0 +1,6 @@
+-- This test is about UHC (Windows-949 / CP949) encoding.  UHC is a
+-- client-only encoding, so exercise pg_uhc_verifychar() via convert_from()
+-- in a UTF8 database.
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 8fa0a6c47fb..15d5e539961 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t
 # geometry depends on point, lseg, line, box, path, polygon, circle
 # horology depends on date, time, timetz, timestamp, timestamptz, interval
 # ----------
-test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8 encoding euc_kr
+test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8 encoding euc_kr uhc
 
 # ----------
 # Load huge amounts of data
diff --git a/src/test/regress/sql/uhc.sql b/src/test/regress/sql/uhc.sql
new file mode 100644
index 00000000000..6905ad084b4
--- /dev/null
+++ b/src/test/regress/sql/uhc.sql
@@ -0,0 +1,36 @@
+-- This test is about UHC (Windows-949 / CP949) encoding.  UHC is a
+-- client-only encoding, so exercise pg_uhc_verifychar() via convert_from()
+-- in a UTF8 database.
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- valid: EUC_KR-compatible Hangul (U+AC00 "가")
+SELECT encode(convert_to(convert_from('\xb0a1', 'UHC'), 'UTF8'), 'hex');
+
+-- valid: CP949 trail-range endpoints (0x41-0x5A, 0x61-0x7A, 0x81-0xFE) and high leads
+SELECT encode(convert_to(convert_from('\x8141', 'UHC'), 'UTF8'), 'hex');	-- trail 0x41
+SELECT encode(convert_to(convert_from('\x815a', 'UHC'), 'UTF8'), 'hex');	-- trail 0x5A
+SELECT encode(convert_to(convert_from('\x8161', 'UHC'), 'UTF8'), 'hex');	-- trail 0x61
+SELECT encode(convert_to(convert_from('\x817a', 'UHC'), 'UTF8'), 'hex');	-- trail 0x7A
+SELECT encode(convert_to(convert_from('\x8181', 'UHC'), 'UTF8'), 'hex');	-- trail 0x81
+SELECT encode(convert_to(convert_from('\x81fe', 'UHC'), 'UTF8'), 'hex');	-- trail 0xFE
+SELECT encode(convert_to(convert_from('\xc7a1', 'UHC'), 'UTF8'), 'hex');	-- high lead 0xC7
+SELECT encode(convert_to(convert_from('\xfda1', 'UHC'), 'UTF8'), 'hex');	-- high lead 0xFD
+
+-- invalid lead byte (0x80 and 0xFF are unused in CP949)
+SELECT convert_from('\x8041', 'UHC');
+SELECT convert_from('\xff41', 'UHC');
+
+-- invalid trail byte (one step outside each trail-range endpoint)
+SELECT convert_from('\x8140', 'UHC');	-- trail 0x40
+SELECT convert_from('\x815b', 'UHC');	-- trail 0x5B
+SELECT convert_from('\x8160', 'UHC');	-- trail 0x60
+SELECT convert_from('\x817b', 'UHC');	-- trail 0x7B
+SELECT convert_from('\x8180', 'UHC');	-- trail 0x80
+SELECT convert_from('\x81ff', 'UHC');	-- trail 0xFF
+SELECT convert_from('\x8d20', 'UHC');	-- separate case: NONUTF8_INVALID sentinel pair
+
+-- truncated two-byte character (lead byte with no trail byte)
+SELECT convert_from('\x81', 'UHC');
-- 
2.43.0