From 7b3bc97b7dfae464321c8f4682f5eaf571214045 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Fri, 3 Mar 2017 14:25:09 +0900
Subject: [PATCH 09/10] Set of fixes for SASLprep

Rename some variables and adjust the calculations done for Hangul
characters.

Per review from Kyotaro Horiguchi.
---
 src/common/scram-common.c |   8 +++-
 src/common/utf_norm.c     | 106 +++++++++++++++++++++++++---------------------
 2 files changed, 65 insertions(+), 49 deletions(-)

diff --git a/src/common/scram-common.c b/src/common/scram-common.c
index 041cf58f20..b262356325 100644
--- a/src/common/scram-common.c
+++ b/src/common/scram-common.c
@@ -121,7 +121,9 @@ pg_utf_mblen(const unsigned char *s)
  * Check validity of the given null-terminated string for UTF-8.
  *
  * This routine uses pg_utf_mblen() and pg_utf8_islegal() to check each
- * character of the string.
+ * character of the string. Strings made only of ASCII characters do not
+ * need to go through SASLprep, so false is returned for them to let the
+ * caller know that processing can be skipped.
  */
 static bool
 pg_utf8_check_string(const unsigned char *source)
@@ -139,6 +141,10 @@ pg_utf8_check_string(const unsigned char *source)
 		p += l;
 	}
 
+	/* ASCII-only strings skip SASLprep. NOTE(review): "l" looks like the last per-character step, not a total -- confirm this test */
+	if (l == strlen((const char*) source))
+		return false;
+
 	return true;
 }
 
diff --git a/src/common/utf_norm.c b/src/common/utf_norm.c
index c953b66e5f..2e7d6264fd 100644
--- a/src/common/utf_norm.c
+++ b/src/common/utf_norm.c
@@ -4,7 +4,8 @@
  *		Unicode strings (NFKC, NFKD, NFC and NFD).
  *
  * This contains the common low-level routines to perform normalizations
- * per documentation here: http://www.unicode.org/reports/tr15/.
+ * per documentation here: http://www.unicode.org/reports/tr15/, using the
+ * composition version 3.0.
  *
  * Portions Copyright (c) 2017, PostgreSQL Global Development Group
  *
@@ -23,6 +24,17 @@
 #include "common/utf_norm_table.h"
 #include "mb/pg_wchar.h"
 
+/* Constants for calculations with Hangul characters (products parenthesized so they are safe in any expression) */
+#define SBASE		0xAC00	/* first code of the precomposed Hangul range */
+#define LBASE		0x1100	/* base of the first (L) component */
+#define VBASE		0x1161	/* base of the second (V) component */
+#define TBASE		0x11A7	/* base of the optional third (T) component; index 0 means none */
+#define LCOUNT		19	/* number of L values */
+#define VCOUNT		21	/* number of V values */
+#define TCOUNT		28	/* number of T values, including "none" */
+#define NCOUNT		(VCOUNT * TCOUNT)	/* (V, T) combinations per L value */
+#define SCOUNT		(LCOUNT * NCOUNT)	/* size of the precomposed range */
+
 /*
  * utf_to_array
  *
@@ -298,14 +310,14 @@ get_decomposed_size(pg_wchar code)
 	 * See http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details
 	 * on the matter.
 	 */
-	if (code >= 0xAC00 && code < 0xD7A4)
+	if (code >= SBASE && code < SBASE + SCOUNT)
 	{
-		uint32	t, hindex;
+		uint32	tindex, sindex;
 
-		hindex = code - 0xAC00;
-		t = hindex % 28;
+		sindex = code - SBASE;
+		tindex = sindex % TCOUNT;
 
-		if (t != 0)
+		if (tindex != 0)
 			return 3;
 		return 2;
 	}
@@ -344,21 +356,21 @@ static bool
 recompose_code(uint32 start, uint32 code, uint32 *result)
 {
 	/* No need to care about ascii characters */
-	if (start <= 0xef || code <= 0xef)
+	if (start <= 0x7f || code <= 0x7f)
 		return false;
 
 	/* Hangul characters go here */
-	if (start >= 0x1100 && start < 0x1113 &&
-		code >= 0x1161 && code < 0x1176)
+	if (start >= LBASE && start < LBASE + LCOUNT &&
+		code >= VBASE && code < VBASE + VCOUNT)
 	{
-		*result = ((start - 0x1100) * 21 + code - 0x1161) * 28 + 0xAC00;
+		*result = ((start - LBASE) * VCOUNT + code - VBASE) * TCOUNT + SBASE;
 		return true;
 	}
-	else if (start >= 0xAC00 && start < 0xD7A4 &&
-			 !((start - 0xAC00) % 28) &&
-			 code >= 0x11A8 && code < 0x11C3)
+	else if (start >= SBASE && start < (SBASE + SCOUNT) &&
+			 ((start - SBASE) % TCOUNT) == 0 &&
+			 code >= TBASE && code < (TBASE + TCOUNT))
 	{
-		*result = start + code - 0x11A7;
+		*result = start + code - TBASE;
 		return true;
 	}
 	else
@@ -406,24 +418,24 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
 	 * See http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details
 	 * on the matter.
 	 */
-	if (code >= 0xAC00 && code < 0xD7A4)
+	if (code >= SBASE && code < SBASE + SCOUNT)
 	{
-		uint32	l, v, t, hindex;
+		uint32	l, v, tindex, sindex;
 		pg_wchar   *res = *result;
 
-		hindex = code - 0xAC00;
-		l = 0x1100 + hindex / (21 * 28);
-		v = 0x1161 + (hindex % (21 * 28)) / 28;
-		t = hindex % 28;
+		sindex = code - SBASE;
+		l = LBASE + sindex / (VCOUNT * TCOUNT);
+		v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
+		tindex = sindex % TCOUNT;
 
 		res[*current] = l;
 		(*current)++;
 		res[*current] = v;
 		(*current)++;
 
-		if (t != 0)
+		if (tindex != 0)
 		{
-			res[*current] = 0x11A7 + t;
+			res[*current] = TBASE + tindex;
 			(*current)++;
 		}
 
@@ -475,11 +487,10 @@ utf_sasl_prepare(const char *input)
 	int			count;
 	char	   *result;
 	/* variables for recomposition */
-	int			lastClass;
-	int			starterPos;
-	int			sourceLength;
-	int			targetPos;
-	uint32		starterCh;
+	int			last_class;
+	int			starter_pos;
+	int			target_pos;
+	uint32		starter_ch;
 
 	/* Convert input string into a manipulable array of character integers */
 	input_chars = utf_to_array((char *) input, &input_size);
@@ -527,8 +538,8 @@ utf_sasl_prepare(const char *input)
 	Assert(decomp_size == current_size);
 
 	/*
-	 * Now that the decomposition is done, apply the combining class
-	 * for each multibyte character.
+	 * Now end the decomposition by applying the combining class for
+	 * each multibyte character.
 	 */
 	for (count = 1; count < decomp_size; count++)
 	{
@@ -577,40 +588,39 @@ utf_sasl_prepare(const char *input)
 	 * make the allocation of the recomposed string based on that assumption.
 	 */
 	recomp_chars = (pg_wchar *) malloc(decomp_size * sizeof(int));
-	lastClass = -1;	 /* this eliminates a special check */
-	starterPos = 0;
-	sourceLength = decomp_size;
-	targetPos = 1;
-	starterCh = recomp_chars[0] = decomp_chars[0];
+	last_class = -1;	 /* this eliminates a special check */
+	starter_pos = 0;
+	target_pos = 1;
+	starter_ch = recomp_chars[0] = decomp_chars[0];
 
 	for (count = 1; count < decomp_size; count++)
 	{
 		pg_wchar	ch = decomp_chars[count];
-		pg_utf_decomposition *chEntry = get_code_entry(ch);
-		int			chClass = chEntry == NULL ? 0 : chEntry->class;
+		pg_utf_decomposition *ch_entry = get_code_entry(ch);
+		int			ch_class = ch_entry == NULL ? 0 : ch_entry->class;
 		pg_wchar	composite;
-		bool		found_match = recompose_code(starterCh, ch, &composite);
+		bool		found_match = recompose_code(starter_ch, ch, &composite);
 
-		if (found_match && lastClass < chClass)
+		if (found_match && last_class < ch_class)
 		{
-			recomp_chars[starterPos] = composite;
-			starterCh = composite;
+			recomp_chars[starter_pos] = composite;
+			starter_ch = composite;
 		}
-		else if (chClass == 0)
+		else if (ch_class == 0)
 		{
-			starterPos = targetPos;
-			starterCh  = ch;
-			lastClass  = -1;
-			recomp_chars[targetPos++] = ch;
+			starter_pos = target_pos;
+			starter_ch  = ch;
+			last_class  = -1;
+			recomp_chars[target_pos++] = ch;
 		}
 		else
 		{
-			lastClass = chClass;
-			recomp_chars[targetPos++] = ch;
+			last_class = ch_class;
+			recomp_chars[target_pos++] = ch;
 		}
 	}
 
-	recomp_size = targetPos;
+	recomp_size = target_pos;
 
 	/*
 	 * Convert the decomposition back to a string, which is the final
-- 
2.12.0