[Mono-dev] eglib patch to implement some unicode stuff

Atsushi Eno Fri, 31 Oct 2008 16:20:33 -0700

Hello,

Here is a patch that implements the following Unicode functions in eglib,
as well as a couple of surrogate-handling fixes in the existing code:


- g_unichar_type()
- g_unichar_toupper()
- g_unichar_tolower()
- g_unichar_totitle()
- g_utf8_strup()
- g_utf8_strdown()

It comes with a table generator that consumes the Unicode Character
Database (UCD). I plan to put it under mono/tools/eglib-ucd or somewhere
similar (or under mcs/class/corlib/Mono.Globalization.Unicode, where I
originally planned to put it). Its output is unicode-data.h (gzipped).

If nothing looks wrong I'll commit next week. Please review :)

Atsushi Eno

//
// UCD.cs
//
// Author:
//	Atsushi Enomoto  <[EMAIL PROTECTED]>
//
// Copyright (C) 2008 Novell, Inc.
//

//
// Unicode table generator for eglib.
// Note that this code is only for Unicode 5.1.0 or earlier.
// (regarding character ranges)
//
// Some premises:
// - lower-band (0000-FFFF) characters never have a case mapping to
//   higher-band characters. Hence, the simple upper/lower mapping is split
//   into 16-bit and 32-bit tables.
//

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Reflection;

namespace Mono.Globalization.Unicode
{
	/// <summary>
	/// Command line entry point of the generator. It parses the UCD file given
	/// as the first argument and writes the generated C header to standard output.
	/// </summary>
	public class Driver
	{
		/// <summary>
		/// Generates the header.
		/// </summary>
		/// <param name="args">args [0] is the path of the UCD file to parse.</param>
		public static void Main (string [] args)
		{
			string generator_name = Assembly.GetEntryAssembly ().GetName ().Name;

			// Without this check a missing argument surfaces as a bare
			// IndexOutOfRangeException from args [0].
			if (args == null || args.Length < 1) {
				Console.Error.WriteLine ("Usage: {0}.exe <UCD data file>", generator_name);
				Environment.ExitCode = 1;
				return;
			}

			TextWriter w = Console.Out;
			// Emit LF line endings regardless of the platform running the generator.
			w.NewLine = "\n";

			w.WriteLine (@"/*
This file is automatically generated by {0}.exe.
The source for this generator should be in Mono repository
(mcs/class/corlib/Mono.Globalization.Unicode directory).
*/

#ifndef __UNICODE_DATA_H
#define __UNICODE_DATA_H

#include <glib.h>

", generator_name);
			var ud = new UnicodeData5_1_0 ();
			var ucd = ud.ParseFile (args [0]);
			var ucg = new UnicodeDataCodeGeneratorC5_1_0 (ud, w);
			ucg.GenerateStructures ();
			w.WriteLine ();
			ucg.GenerateUnicodeCategoryListC (ucd);
			w.WriteLine ();
			ucg.GenerateSimpleCaseMappingListC (ucd);
			w.WriteLine ();
			ucg.GenerateSimpleTitlecaseMappingListC (ucd);
			w.WriteLine (@"
#endif
");
		}
	}

	/// <summary>
	/// Code point ranges for which tables are generated. Per the note at the
	/// top of this file they are valid for Unicode 5.1.0 or earlier only.
	/// </summary>
	public class UnicodeData5_1_0 : UnicodeData
	{
		// Ranges that receive simple upper/lower case mapping tables.
		static readonly CodePointRange [] case_mapping_ranges = {
			new CodePointRange (0x0040, 0x0600),
			new CodePointRange (0x1000, 0x10D0),
			new CodePointRange (0x1D00, 0x2000),
			new CodePointRange (0x2100, 0x21C0),
			new CodePointRange (0x2480, 0x2500),
			new CodePointRange (0x2C00, 0x2D80),
			new CodePointRange (0xA640, 0xA7C0),
			new CodePointRange (0xFF20, 0xFF80),
			new CodePointRange (0x10400, 0x10480),
			};

		// Ranges that receive general category tables. The comments mark
		// the blocks that are skipped between two ranges, together with
		// the single category noted for each of them.
		static readonly CodePointRange [] general_category_ranges = {
			new CodePointRange (0x0000, 0x3400),
			// 3400-4DB5: OtherLetter
			new CodePointRange (0x4DC0, 0x4E00),
			// 4E00-9FC3: OtherLetter
			new CodePointRange (0xA000, 0xAA80),
			// AC00-D7A3: OtherLetter
			// D800-DFFF: Surrogate
			// E000-F8FF: PrivateUse
			new CodePointRange (0xF900, 0x10000),
			new CodePointRange (0x10000, 0x104C0),
			new CodePointRange (0x10800, 0x10A80),
			new CodePointRange (0x12000, 0x12480),
			new CodePointRange (0x1D000, 0x1D800),
			new CodePointRange (0x1F000, 0x1F0C0),
			// 20000-2A6D6: OtherLetter
			new CodePointRange (0x2F800, 0x2FA40),
			new CodePointRange (0xE0000, 0xE0200),
			// F0000-FFFFD: PrivateUse
			// 100000-10FFFD: PrivateUse
			};

		/// <summary>Ranges covered by the simple case mapping tables.</summary>
		public override CodePointRange [] SimpleCases {
			get { return case_mapping_ranges; }
		}

		/// <summary>Ranges covered by the general category tables.</summary>
		public override CodePointRange [] CategoryRanges {
			get { return general_category_ranges; }
		}
	}

	/// <summary>
	/// Parser for a semicolon separated UCD character data file, plus the
	/// version specific code point ranges that drive table generation.
	/// </summary>
	public abstract class UnicodeData
	{
		// Parse () reads fields 0 through 14 of every line.
		const int FieldCount = 15;

		/// <summary>Ranges covered by the simple case mapping tables.</summary>
		public abstract CodePointRange [] SimpleCases { get; }

		/// <summary>Ranges covered by the general category tables.</summary>
		public abstract CodePointRange [] CategoryRanges { get; }

		/// <summary>
		/// Reads every non-empty line of <paramref name="file"/> that does not
		/// start with '#' and returns the parsed entries in file order.
		/// </summary>
		/// <param name="file">Path of the UCD file to read.</param>
		/// <returns>One entry per parsed line.</returns>
		public virtual UcdCharacterProperty [] ParseFile (string file)
		{
			var list = new List<UcdCharacterProperty> ();

			using (TextReader r = File.OpenText (file)) {
				string l;
				while ((l = r.ReadLine ()) != null) {
					if (l.Length > 0 && l [0] != '#')
						list.Add (Parse (l));
				}
			}
			return list.ToArray ();
		}

		// Parses one semicolon separated line into a character property entry.
		UcdCharacterProperty Parse (string line)
		{
			string [] tokens = line.Split (';');
			if (tokens.Length < FieldCount)
				throw new ArgumentException (String.Format ("Expected {0} fields but found {1} in line '{2}'", FieldCount, tokens.Length, line));

			// Field 5 is an optional "<type>" tag followed by hexadecimal
			// code points, all separated by single spaces.
			string [] decomp = tokens [5].Length > 0 ? tokens [5].Split (' ') : null;
			string decomp_type = decomp != null && decomp [0].StartsWith ("<", StringComparison.Ordinal) ? decomp [0] : null;
			if (decomp_type != null) {
				// Drop the tag so that only the code points remain.
				var mapping = new string [decomp.Length - 1];
				Array.Copy (decomp, 1, mapping, 0, mapping.Length);
				decomp = mapping;
			}

			return new UcdCharacterProperty () {
				Codepoint = ParseHex (tokens [0]),
				Name = tokens [1],
				Category = ParseUnicodeCategory (tokens [2]),
				CanonicalCombiningClass = tokens [3].Length > 0 ? (byte?) byte.Parse (tokens [3], CultureInfo.InvariantCulture) : null,
				BidiClass = tokens [4].Length > 0 ? (UcdBidiClass) Enum.Parse (typeof (UcdBidiClass), tokens [4]) : UcdBidiClass.None,
				DecompositionType = decomp_type != null ? ParseDecompositionType (decomp_type) : UcdDecompositionType.None,
				DecompositionMapping = decomp != null ? Array.ConvertAll<string,int> (decomp, dv => ParseHex (dv)) : null,
				DecimalDigitValue = tokens [6],
				DigitValue = tokens [7],
				NumericValue = tokens [8],
				BidiMirrored = (tokens [9] == "Y"),
				Unicode1Name = tokens [10],
				IsoComment = tokens [11],
				// An empty mapping field is stored as 0.
				SimpleUppercaseMapping = tokens [12].Length > 0 ? ParseHex (tokens [12]) : 0,
				SimpleLowercaseMapping = tokens [13].Length > 0 ? ParseHex (tokens [13]) : 0,
				SimpleTitlecaseMapping = tokens [14].Length > 0 ? ParseHex (tokens [14]) : 0,
				};
		}

		// Parses a hexadecimal code point independently of the current culture.
		static int ParseHex (string s)
		{
			return int.Parse (s, NumberStyles.HexNumber, CultureInfo.InvariantCulture);
		}

		// Maps a decomposition tag, including the angle brackets, to its enum
		// value. Throws ArgumentException for a tag that is not listed here.
		UcdDecompositionType ParseDecompositionType (string s)
		{
			switch (s) {
			case "<font>":
				return UcdDecompositionType.Font;
			case "<noBreak>":
				return UcdDecompositionType.NoBreak;
			case "<initial>":
				return UcdDecompositionType.Initial;
			case "<medial>":
				return UcdDecompositionType.Medial;
			case "<final>":
				return UcdDecompositionType.Final;
			case "<isolated>":
				return UcdDecompositionType.Isolated;
			case "<circle>":
				return UcdDecompositionType.Circle;
			case "<super>":
				return UcdDecompositionType.Super;
			case "<sub>":
				return UcdDecompositionType.Sub;
			case "<vertical>":
				return UcdDecompositionType.Vertical;
			case "<wide>":
				return UcdDecompositionType.Wide;
			case "<narrow>":
				return UcdDecompositionType.Narrow;
			case "<small>":
				return UcdDecompositionType.Small;
			case "<square>":
				return UcdDecompositionType.Square;
			case "<fraction>":
				return UcdDecompositionType.Fraction;
			case "<compat>":
				return UcdDecompositionType.Compat;
			}
			throw new ArgumentException (String.Format ("Unexpected decomposition type '{0}'", s));
		}

		// Maps a two letter general category abbreviation to the matching
		// System.Globalization.UnicodeCategory value. Throws ArgumentException
		// for an abbreviation that is not listed here.
		UnicodeCategory ParseUnicodeCategory (string s)
		{
			switch (s) {
			case "Lu":
				return UnicodeCategory.UppercaseLetter;
			case "Ll":
				return UnicodeCategory.LowercaseLetter;
			case "Lt":
				return UnicodeCategory.TitlecaseLetter;
			case "Lm":
				return UnicodeCategory.ModifierLetter;
			case "Lo":
				return UnicodeCategory.OtherLetter;
			case "Mn":
				return UnicodeCategory.NonSpacingMark;
			case "Mc":
				return UnicodeCategory.SpacingCombiningMark;
			case "Me":
				return UnicodeCategory.EnclosingMark;
			case "Nd":
				return UnicodeCategory.DecimalDigitNumber;
			case "Nl":
				return UnicodeCategory.LetterNumber;
			case "No":
				return UnicodeCategory.OtherNumber;
			case "Pc":
				return UnicodeCategory.ConnectorPunctuation;
			case "Pd":
				return UnicodeCategory.DashPunctuation;
			case "Ps":
				return UnicodeCategory.OpenPunctuation;
			case "Pe":
				return UnicodeCategory.ClosePunctuation;
			case "Pi":
				return UnicodeCategory.InitialQuotePunctuation;
			case "Pf":
				return UnicodeCategory.FinalQuotePunctuation;
			case "Po":
				return UnicodeCategory.OtherPunctuation;
			case "Sm":
				return UnicodeCategory.MathSymbol;
			case "Sc":
				return UnicodeCategory.CurrencySymbol;
			case "Sk":
				return UnicodeCategory.ModifierSymbol;
			case "So":
				return UnicodeCategory.OtherSymbol;
			case "Zs":
				return UnicodeCategory.SpaceSeparator;
			case "Zl":
				return UnicodeCategory.LineSeparator;
			case "Zp":
				return UnicodeCategory.ParagraphSeparator;
			case "Cc":
				return UnicodeCategory.Control;
			case "Cf":
				return UnicodeCategory.Format;
			case "Cs":
				return UnicodeCategory.Surrogate;
			case "Co":
				return UnicodeCategory.PrivateUse;
			case "Cn":
				return UnicodeCategory.OtherNotAssigned;
			}
			throw new ArgumentException (String.Format ("Unexpected category {0}", s));
		}
	}

	/// <summary>
	/// Writes the C declarations and lookup tables of the generated header
	/// (general categories, simple case mappings, titlecase exceptions).
	/// </summary>
	public class UnicodeDataCodeGeneratorC5_1_0
	{
		// Number of table cells written before the output line is wrapped.
		const int CellsPerLine = 16;

		UnicodeData catalog;
		TextWriter w;

		/// <param name="catalog">Supplies the code point ranges to generate tables for.</param>
		/// <param name="writer">Destination of the generated C source.</param>
		public UnicodeDataCodeGeneratorC5_1_0 (UnicodeData catalog, TextWriter writer)
		{
			this.catalog = catalog;
			w = writer;
		}

		/// <summary>Writes the typedefs used by the generated tables.</summary>
		public void GenerateStructures ()
		{
			w.WriteLine ("/* ======== Structures ======== */");
			w.WriteLine (@"typedef struct {
	guint32 codepoint;
	guint32 upper;
	guint32 title;
} SimpleTitlecaseMapping;");
			w.WriteLine (@"typedef struct {
	guint32 start;
	guint32 end;
} CodePointRange;");
			w.WriteLine (@"typedef struct {
	guint32 upper;
	guint32 lower;
} SimpleCaseMapping;");
		}

		// Writes "<name>_count" and the "<name>" range array, which ends
		// with a {0, 0} entry.
		void GenerateCodePointRanges (string name, CodePointRange [] ranges)
		{
			w.WriteLine ("static const guint8 {0}_count = {1};", name, ranges.Length);
			w.WriteLine ("static const CodePointRange {0} [] = {{", name);
			foreach (var cpr in ranges)
				w.WriteLine ("{{0x{0:X06}, 0x{1:X06}}},", cpr.Start, cpr.End);
			// Single argument overload: the text is written literally, so
			// the braces must not be doubled here.
			w.WriteLine ("{0, 0}};");
		}

		// Writes the body of one table: a heading comment, one cell for every
		// code point in [range.Start, range.End) and a final "0};".
		// Code points that have no entry in ucd get the filler value. The
		// table always spans the whole range, so that indexing it with
		// (cp - start) stays in bounds for every cp below range.End.
		// ucd must be sorted by code point in ascending order.
		void GenerateDenseTable (UcdCharacterProperty [] ucd, CodePointRange range, string filler, Converter<UcdCharacterProperty,string> cell)
		{
			w.WriteLine ("\t/* ==== {0:X}-{1:X} ==== */", range.Start, range.End);
			w.Write ("\t");
			int cp = range.Start;
			foreach (var ucp in ucd) {
				if (ucp.Codepoint >= range.End)
					break;
				// Entries below the range, or repeated code points.
				if (ucp.Codepoint < cp)
					continue;
				while (cp < ucp.Codepoint)
					WriteCell (filler, ref cp);
				WriteCell (cell (ucp), ref cp);
			}
			// Pad the tail after the last entry found inside the range.
			while (cp < range.End)
				WriteCell (filler, ref cp);
			w.WriteLine ("0};");
		}

		// Writes one cell, advances cp and wraps the output line after
		// every CellsPerLine-th code point.
		void WriteCell (string value, ref int cp)
		{
			w.Write (value);
			w.Write (',');
			if (++cp % CellsPerLine == 0)
				w.Write ("\n\t");
		}

		/// <summary>
		/// Writes the category ranges, one category table per range and the
		/// "unicode_category" array that points at those tables.
		/// </summary>
		/// <param name="ucd">Parsed entries, sorted by code point.</param>
		public void GenerateUnicodeCategoryListC (UcdCharacterProperty [] ucd)
		{
			w.WriteLine ("/* ======== Unicode Categories ======== */");
			GenerateCodePointRanges ("unicode_category_ranges", catalog.CategoryRanges);

			// Code points without an entry are unassigned. A literal 0 would
			// denote G_UNICODE_CONTROL instead.
			string unassigned = ((int) GUnicodeType.G_UNICODE_UNASSIGNED).ToString (CultureInfo.InvariantCulture);

			int table = 0;
			foreach (var cpr in catalog.CategoryRanges) {
				// "static" like every other table of the header, so that
				// including it does not define external symbols.
				w.WriteLine ("static const GUnicodeType unicode_category_table{0} [] = {{", table);
				GenerateDenseTable (ucd, cpr, unassigned, ucp => ((int) ToGUnicodeCategory (ucp.Category)).ToString (CultureInfo.InvariantCulture));
				table++;
			}

			w.WriteLine ("static const GUnicodeType *unicode_category [{0}]  = {{", catalog.CategoryRanges.Length);
			for (int i = 0, end = catalog.CategoryRanges.Length; i < end; i++)
				w.WriteLine ("\tunicode_category_table{0}{1}", i, i + 1 < end ? "," : String.Empty);
			w.WriteLine ("};");
		}

		/// <summary>
		/// Writes "simple_titlecase_mapping", which lists every entry whose
		/// titlecase mapping differs from its uppercase mapping, followed by
		/// the number of entries written.
		/// </summary>
		/// <param name="ucd">Parsed entries.</param>
		public void GenerateSimpleTitlecaseMappingListC (UcdCharacterProperty [] ucd)
		{
			w.WriteLine ("static const SimpleTitlecaseMapping simple_titlecase_mapping [] = {");
			int count = 0;
			foreach (var ucp in ucd) {
				if (ucp.SimpleUppercaseMapping == ucp.SimpleTitlecaseMapping)
					continue;
				if (count > 0)
					w.WriteLine (',');
				w.Write ("\t{{0x{0:X06}, 0x{1:X06}, 0x{2:X06}}}", ucp.Codepoint, ucp.SimpleUppercaseMapping, ucp.SimpleTitlecaseMapping);
				count++;
			}
			w.WriteLine ();
			w.WriteLine ("};");
			w.WriteLine ("static const guint8 simple_titlecase_mapping_count = {0};", count);
		}

		/// <summary>
		/// Writes the case mapping ranges, then the upper and lower mapping
		/// tables, each for the 16-bit area and for the area above FFFF.
		/// </summary>
		/// <param name="ucd">Parsed entries, sorted by code point.</param>
		public void GenerateSimpleCaseMappingListC (UcdCharacterProperty [] ucd)
		{
			GenerateCodePointRanges ("simple_case_map_ranges", catalog.SimpleCases);
			GenerateSimpleCaseMappingListC (ucd, true, true);
			GenerateSimpleCaseMappingListC (ucd, true, false);
			GenerateSimpleCaseMappingListC (ucd, false, true);
			GenerateSimpleCaseMappingListC (ucd, false, false);
		}

		// Writes the tables of one mapping direction (upper or lower) for
		// one area: "small" selects the ranges starting at or below FFFF
		// (guint16 cells), otherwise the ranges starting above FFFF
		// (guint32 cells). A cell value of 0 means "no mapping".
		void GenerateSimpleCaseMappingListC (UcdCharacterProperty [] ucd, bool upper, bool small)
		{
			string type_name = small ? "guint16" : "guint32";
			string case_name = upper ? "upper" : "lower";
			string area_name = small ? "lowarea" : "higharea";

			int nTable = 0;
			foreach (var cpr in catalog.SimpleCases) {
				if (small && cpr.Start > 0xFFFF)
					break;
				if (!small && cpr.Start < 0x10000)
					continue;

				w.WriteLine ("static const {0} simple_{1}_case_mapping_{2}_table{3} [] = {{", type_name, case_name, area_name, nTable);
				GenerateDenseTable (ucd, cpr, "0", ucp => {
					int v = upper ? ucp.SimpleUppercaseMapping : ucp.SimpleLowercaseMapping;
					return v != 0 ? String.Format (CultureInfo.InvariantCulture, "0x{0:X}", v) : "0";
				});

				nTable++;
			}

			w.WriteLine ("static const {0} *simple_{1}_case_mapping_{2} [] = {{", type_name, case_name, area_name);

			for (int i = 0; i < nTable; i++) {
				if (i > 0)
					w.WriteLine (",");
				w.Write ("\tsimple_{0}_case_mapping_{1}_table{2}", case_name, area_name, i);
			}

			w.WriteLine ("};");
			// The C lookup code subtracts the number of lowarea tables from
			// the range index to address the higharea tables, so the count
			// has to be part of the header.
			w.WriteLine ("static const guint8 simple_{0}_case_mapping_{1}_table_count = {2};", case_name, area_name, nTable);
			w.WriteLine ();
		}

		// Category constants as written into the tables. The numeric value
		// of each member is what ends up in the header, so the order is
		// significant; presumably it mirrors GLib's GUnicodeType - verify
		// against glib.h before changing it.
		enum GUnicodeType
		{
			G_UNICODE_CONTROL,
			G_UNICODE_FORMAT,
			G_UNICODE_UNASSIGNED,
			G_UNICODE_PRIVATE_USE,
			G_UNICODE_SURROGATE,
			G_UNICODE_LOWERCASE_LETTER,
			G_UNICODE_MODIFIER_LETTER,
			G_UNICODE_OTHER_LETTER,
			G_UNICODE_TITLECASE_LETTER,
			G_UNICODE_UPPERCASE_LETTER,
			G_UNICODE_COMBINING_MARK,
			G_UNICODE_ENCLOSING_MARK,
			G_UNICODE_NON_SPACING_MARK,
			G_UNICODE_DECIMAL_NUMBER,
			G_UNICODE_LETTER_NUMBER,
			G_UNICODE_OTHER_NUMBER,
			G_UNICODE_CONNECT_PUNCTUATION,
			G_UNICODE_DASH_PUNCTUATION,
			G_UNICODE_CLOSE_PUNCTUATION,
			G_UNICODE_FINAL_PUNCTUATION,
			G_UNICODE_INITIAL_PUNCTUATION,
			G_UNICODE_OTHER_PUNCTUATION,
			G_UNICODE_OPEN_PUNCTUATION,
			G_UNICODE_CURRENCY_SYMBOL,
			G_UNICODE_MODIFIER_SYMBOL,
			G_UNICODE_MATH_SYMBOL,
			G_UNICODE_OTHER_SYMBOL,
			G_UNICODE_LINE_SEPARATOR,
			G_UNICODE_PARAGRAPH_SEPARATOR,
			G_UNICODE_SPACE_SEPARATOR
		}

		// Maps a .NET UnicodeCategory to the constant written into the
		// tables. Throws ArgumentException for a value not listed here.
		GUnicodeType ToGUnicodeCategory (UnicodeCategory v)
		{
			switch (v) {
			case UnicodeCategory.UppercaseLetter:
				return GUnicodeType.G_UNICODE_UPPERCASE_LETTER;
			case UnicodeCategory.LowercaseLetter:
				return GUnicodeType.G_UNICODE_LOWERCASE_LETTER;
			case UnicodeCategory.TitlecaseLetter:
				return GUnicodeType.G_UNICODE_TITLECASE_LETTER;
			case UnicodeCategory.ModifierLetter:
				return GUnicodeType.G_UNICODE_MODIFIER_LETTER;
			case UnicodeCategory.OtherLetter:
				return GUnicodeType.G_UNICODE_OTHER_LETTER;
			case UnicodeCategory.NonSpacingMark:
				return GUnicodeType.G_UNICODE_NON_SPACING_MARK;
			case UnicodeCategory.SpacingCombiningMark:
				return GUnicodeType.G_UNICODE_COMBINING_MARK;
			case UnicodeCategory.EnclosingMark:
				return GUnicodeType.G_UNICODE_ENCLOSING_MARK;
			case UnicodeCategory.DecimalDigitNumber:
				return GUnicodeType.G_UNICODE_DECIMAL_NUMBER;
			case UnicodeCategory.LetterNumber:
				return GUnicodeType.G_UNICODE_LETTER_NUMBER;
			case UnicodeCategory.OtherNumber:
				return GUnicodeType.G_UNICODE_OTHER_NUMBER;
			case UnicodeCategory.ConnectorPunctuation:
				return GUnicodeType.G_UNICODE_CONNECT_PUNCTUATION;
			case UnicodeCategory.DashPunctuation:
				return GUnicodeType.G_UNICODE_DASH_PUNCTUATION;
			case UnicodeCategory.OpenPunctuation:
				return GUnicodeType.G_UNICODE_OPEN_PUNCTUATION;
			case UnicodeCategory.ClosePunctuation:
				return GUnicodeType.G_UNICODE_CLOSE_PUNCTUATION;
			case UnicodeCategory.InitialQuotePunctuation:
				return GUnicodeType.G_UNICODE_INITIAL_PUNCTUATION;
			case UnicodeCategory.FinalQuotePunctuation:
				return GUnicodeType.G_UNICODE_FINAL_PUNCTUATION;
			case UnicodeCategory.OtherPunctuation:
				return GUnicodeType.G_UNICODE_OTHER_PUNCTUATION;
			case UnicodeCategory.MathSymbol:
				return GUnicodeType.G_UNICODE_MATH_SYMBOL;
			case UnicodeCategory.CurrencySymbol:
				return GUnicodeType.G_UNICODE_CURRENCY_SYMBOL;
			case UnicodeCategory.ModifierSymbol:
				return GUnicodeType.G_UNICODE_MODIFIER_SYMBOL;
			case UnicodeCategory.OtherSymbol:
				return GUnicodeType.G_UNICODE_OTHER_SYMBOL;
			case UnicodeCategory.SpaceSeparator:
				return GUnicodeType.G_UNICODE_SPACE_SEPARATOR;
			case UnicodeCategory.LineSeparator:
				return GUnicodeType.G_UNICODE_LINE_SEPARATOR;
			case UnicodeCategory.ParagraphSeparator:
				return GUnicodeType.G_UNICODE_PARAGRAPH_SEPARATOR;
			case UnicodeCategory.Control:
				return GUnicodeType.G_UNICODE_CONTROL;
			case UnicodeCategory.Format:
				return GUnicodeType.G_UNICODE_FORMAT;
			case UnicodeCategory.Surrogate:
				return GUnicodeType.G_UNICODE_SURROGATE;
			case UnicodeCategory.PrivateUse:
				return GUnicodeType.G_UNICODE_PRIVATE_USE;
			case UnicodeCategory.OtherNotAssigned:
				return GUnicodeType.G_UNICODE_UNASSIGNED;
			}
			throw new ArgumentException (String.Format ("Unexpected category {0}", v));
		}
	}

	/// <summary>
	/// A pair of code points delimiting a range. The table generators in
	/// this file stop before End, i.e. they treat it as exclusive.
	/// </summary>
	public class CodePointRange
	{
		int start;
		int end;

		public CodePointRange (int start, int end)
		{
			this.start = start;
			this.end = end;
		}

		/// <summary>First code point of the range.</summary>
		public int Start {
			get { return start; }
			set { start = value; }
		}

		/// <summary>Upper bound of the range.</summary>
		public int End {
			get { return end; }
			set { end = value; }
		}
	}

	// One parsed line of the UCD character data file. The field numbers
	// below are the zero-based positions in the semicolon separated line,
	// as read by UnicodeData.Parse ().
	public class UcdCharacterProperty
	{
		// Field 0, parsed as hexadecimal.
		public int Codepoint { get; set; }
		// Field 1.
		public string Name { get; set; }
		// Field 2, converted from the two letter abbreviation.
		public UnicodeCategory Category { get; set; }
		// Field 3; null when the field is empty.
		public byte? CanonicalCombiningClass { get; set; }
		// Field 4; UcdBidiClass.None when the field is empty.
		public UcdBidiClass BidiClass { get; set; }
		// Leading "<...>" tag of field 5; UcdDecompositionType.None when
		// there is no tag.
		public UcdDecompositionType DecompositionType { get; set; }
		// Code points of field 5 without the tag; null when the field is empty.
		public int [] DecompositionMapping { get; set; }
		// Fields 6, 7 and 8, kept as the raw text of the file.
		public string DecimalDigitValue { get; set; }
		public string DigitValue { get; set; }
		public string NumericValue { get; set; }
		// True when field 9 is "Y".
		public bool BidiMirrored { get; set; }
		// Fields 10 and 11.
		public string Unicode1Name { get; set; }
		public string IsoComment { get; set; }
		// Fields 12, 13 and 14, parsed as hexadecimal; 0 when the field is empty.
		public int SimpleUppercaseMapping { get; set; }
		public int SimpleLowercaseMapping { get; set; }
		public int SimpleTitlecaseMapping { get; set; }
	}

	// Bidirectional class of a character. UnicodeData.Parse () converts the
	// text of field 4 with Enum.Parse, so apart from None every member name
	// has to match the abbreviation used in the data file exactly.
	public enum UcdBidiClass
	{
		// Used when the field is empty.
		None,
		L,
		LRE,
		LRO,
		R,
		AL,
		RLE,
		RLO,
		PDF,
		EN,
		ES,
		ET,
		AN,
		CS,
		NSM,
		BN,
		B,
		S,
		WS,
		ON
	}

	// Decomposition tag of a character. Each member other than None
	// corresponds to one "<...>" tag recognized by
	// UnicodeData.ParseDecompositionType ().
	public enum UcdDecompositionType
	{
		// Used when the decomposition field carries no tag.
		None,
		Font,
		NoBreak,
		Initial,
		Medial,
		Final,
		Isolated,
		Circle,
		Super,
		Sub,
		Vertical,
		Wide,
		Narrow,
		Small,
		Square,
		Fraction,
		Compat
	}
}

Index: test/unicode.c
===================================================================
--- test/unicode.c	(revision 0)
+++ test/unicode.c	(revision 0)
@@ -0,0 +1,99 @@
+#include "test.h"
+
+/*
+ * g_unichar_type
+ */
+RESULT
+test_g_unichar_type ()
+{
+	if (g_unichar_type ('A') != G_UNICODE_UPPERCASE_LETTER)
+		return FAILED ("#1");
+	if (g_unichar_type ('a') != G_UNICODE_LOWERCASE_LETTER)
+		return FAILED ("#2");
+	if (g_unichar_type ('1') != G_UNICODE_DECIMAL_NUMBER)
+		return FAILED ("#3");
+	if (g_unichar_type (0xA3) != G_UNICODE_CURRENCY_SYMBOL)
+		return FAILED ("#4");
+	return NULL;
+}
+
+/*
+ * g_unichar_toupper
+ */
+RESULT
+test_g_unichar_toupper ()
+{
+	if (g_unichar_toupper (0) != 0)
+		return FAILED ("#0");
+	if (g_unichar_toupper ('a') != 'A')
+		return FAILED ("#1");
+	if (g_unichar_toupper ('1') != '1')
+		return FAILED ("#2");
+	if (g_unichar_toupper (0x1C4) != 0x1C4)
+		return FAILED ("#3");
+	if (g_unichar_toupper (0x1F2) != 0x1F1)
+		return FAILED ("#4");
+	if (g_unichar_toupper (0x1F3) != 0x1F1)
+		return FAILED ("#5");
+	if (g_unichar_toupper (0xFFFF) != 0xFFFF)
+		return FAILED ("#6");
+	if (g_unichar_toupper (0x10428) != 0x10400)
+		return FAILED ("#7");
+	return NULL;
+}
+
+/*
+ * g_unichar_tolower
+ */
+RESULT
+test_g_unichar_tolower ()
+{
+	if (g_unichar_tolower (0) != 0)
+		return FAILED ("#0");
+	if (g_unichar_tolower ('A') != 'a')
+		return FAILED ("#1");
+	if (g_unichar_tolower ('1') != '1')
+		return FAILED ("#2");
+	if (g_unichar_tolower (0x1C5) != 0x1C6)
+		return FAILED ("#3");
+	if (g_unichar_tolower (0x1F1) != 0x1F3)
+		return FAILED ("#4");
+	if (g_unichar_tolower (0x1F2) != 0x1F3)
+		return FAILED ("#5");
+	if (g_unichar_tolower (0xFFFF) != 0xFFFF)
+		return FAILED ("#6");
+	return NULL;
+}
+
+/*
+ * g_unichar_totitle (checks #0 and #6 must exercise totitle, not toupper)
+ */
+RESULT
+test_g_unichar_totitle ()
+{
+	if (g_unichar_totitle (0) != 0)
+		return FAILED ("#0");
+	if (g_unichar_totitle ('a') != 'A')
+		return FAILED ("#1");
+	if (g_unichar_totitle ('1') != '1')
+		return FAILED ("#2");
+	if (g_unichar_totitle (0x1C4) != 0x1C5)
+		return FAILED ("#3");
+	if (g_unichar_totitle (0x1F2) != 0x1F2)
+		return FAILED ("#4");
+	if (g_unichar_totitle (0x1F3) != 0x1F2)
+		return FAILED ("#5");
+	if (g_unichar_totitle (0xFFFF) != 0xFFFF)
+		return FAILED ("#6");
+	return NULL;
+}
+
+static Test unicode_tests [] = {
+	{"g_unichar_type", test_g_unichar_type},
+	{"g_unichar_toupper", test_g_unichar_toupper},
+	{"g_unichar_tolower", test_g_unichar_tolower},
+	{"g_unichar_totitle", test_g_unichar_totitle},
+	{NULL, NULL}
+};
+
+DEFINE_TEST_GROUP_INIT(unicode_tests_init, unicode_tests)
Index: test/tests.h
===================================================================
--- test/tests.h	(revision 117383)
+++ test/tests.h	(working copy)
@@ -18,6 +18,7 @@
 DEFINE_TEST_GROUP_INIT_H(pattern_tests_init);
 DEFINE_TEST_GROUP_INIT_H(dir_tests_init);
 DEFINE_TEST_GROUP_INIT_H(markup_tests_init);
+DEFINE_TEST_GROUP_INIT_H(unicode_tests_init);
 DEFINE_TEST_GROUP_INIT_H(utf8_tests_init);
 DEFINE_TEST_GROUP_INIT_H(endian_tests_init);
 DEFINE_TEST_GROUP_INIT_H(module_tests_init);
@@ -42,6 +43,7 @@
 	{"file",      file_tests_init},
 	{"pattern",   pattern_tests_init},
 	{"dir",       dir_tests_init},
+	{"unicode",   unicode_tests_init},
 	{"utf8",      utf8_tests_init},
 	{"endian",    endian_tests_init},
 	{"module",    module_tests_init},
Index: test/utf8.c
===================================================================
--- test/utf8.c	(revision 117383)
+++ test/utf8.c	(working copy)
@@ -82,8 +82,8 @@
 RESULT
 test_utf16_to_utf8 ()
 {
-	const gchar *src0 = "", *src1 = "ABCDE", *src2 = "\xE5\xB9\xB4\x27";
-	gunichar2 str0 [] = {0}, str1 [6], str2 [] = {0x5E74, 39, 0};
+	const gchar *src0 = "", *src1 = "ABCDE", *src2 = "\xE5\xB9\xB4\x27", *src3 = "\xEF\xBC\xA1", *src4 = "\xEF\xBD\x81", *src5 = "\xF0\x90\x90\x80";
+	gunichar2 str0 [] = {0}, str1 [6], str2 [] = {0x5E74, 39, 0}, str3 [] = {0xFF21, 0}, str4 [] = {0xFF41, 0}, str5 [] = {0xD801, 0xDC00, 0};
 	RESULT result;
 
 	gchar_to_gunichar2 (str1, src1);
@@ -99,6 +99,15 @@
 	result = compare_utf16_to_utf8 (src2, str2, 2, 4);
 	if (result != OK)
 		return result;
+	result = compare_utf16_to_utf8 (src3, str3, 1, 3);
+	if (result != OK)
+		return result;
+	result = compare_utf16_to_utf8 (src4, str4, 1, 3);
+	if (result != OK)
+		return result;
+	result = compare_utf16_to_utf8 (src5, str5, 2, 4);
+	if (result != OK)
+		return result;
 
 	return OK;
 }
@@ -194,6 +203,7 @@
 	if (out_read != 2) {
 		return FAILED ("out_read is expected to be 2 but was %d\n", out_read);
 	}
+	g_free (dst);
 
 	return OK;
 }
@@ -201,8 +211,8 @@
 RESULT
 test_utf8_to_utf16 ()
 {
-	const gchar *src0 = "", *src1 = "ABCDE", *src2 = "\xE5\xB9\xB4\x27";
-	gunichar2 str0 [] = {0}, str1 [6], str2 [] = {0x5E74, 39, 0};
+	const gchar *src0 = "", *src1 = "ABCDE", *src2 = "\xE5\xB9\xB4\x27", *src3 = "\xEF\xBC\xA1", *src4 = "\xEF\xBD\x81";
+	gunichar2 str0 [] = {0}, str1 [6], str2 [] = {0x5E74, 39, 0}, str3 [] = {0xFF21, 0}, str4 [] = {0xFF41, 0};
 	RESULT result;
 
 	gchar_to_gunichar2 (str1, src1);
@@ -218,6 +228,12 @@
 	result = compare_utf8_to_utf16 (str2, src2, 4, 2);
 	if (result != OK)
 		return result;
+	result = compare_utf8_to_utf16 (str3, src3, 3, 1);
+	if (result != OK)
+		return result;
+	result = compare_utf8_to_utf16 (str4, src4, 3, 1);
+	if (result != OK)
+		return result;
 
 	return OK;
 }
@@ -310,6 +326,8 @@
 	static gunichar2 exp4[4] = {'h',0xdbff,0xdfff,'\0'};
 	static gunichar str5[7] = {0xD7FF,0xD800,0xDFFF,0xE000,0x110000,0x10FFFF,'\0'};
 	static gunichar2 exp5[5] = {0xD7FF,0xE000,0xdbff,0xdfff,'\0'};
+	static gunichar str6[2] = {0x10400, '\0'};
+	static gunichar2 exp6[3] = {0xD801, 0xDC00, '\0'};
 	static glong read_write[12] = {1,1,0,0,0,0,1,1,0,0,1,2};
 	gunichar2* res;
 	glong items_read, items_written, current_write_index;
@@ -337,8 +355,8 @@
 	items_read = items_written = 0;
 	res = g_ucs4_to_utf16 (str2, 2, &items_read, &items_written, &err);
 	check_result = ucs4_to_utf16_check_result (res, 0, items_read, 1, items_written, 0, err, TRUE);
+	g_free (res);
 	if (check_result) return check_result;
-	g_free (res);
 
 	items_read = items_written = 0;
 	err = 0;
@@ -367,6 +385,13 @@
 		current_write_index += items_written;
 	}
 
+	items_read = items_written = 0;
+	err = 0;
+	res = g_ucs4_to_utf16 (str6, 1, &items_read, &items_written, &err);
+	check_result = ucs4_to_utf16_check_result (res, exp6, items_read, 1, items_written, 2, err, FALSE);
+	if (check_result) return check_result;
+	g_free (res);
+
 	return OK;
 }
 
@@ -411,6 +436,8 @@
 	static gunichar2 str4[20] = {0xDC00,0xDFFF,0xDFF,0xD800,0xDBFF,0xD800,0xDC00,0xD800,0xDFFF,
 				     0xD800,0xE000,0xDBFF,0xDBFF,0xDBFF,0xDC00,0xDBFF,0xDFFF,0xDBFF,0xE000,'\0'};
 	static gunichar exp4[6] = {0xDFF,0x10000,0x103ff,0x10fc00,0x10FFFF,'\0'};
+	static gunichar2 str5[3] = {0xD801, 0xDC00, 0};
+	static gunichar exp5[2] = {0x10400, 0};
 	static glong read_write[33] = {1,0,0,1,0,0,1,1,1,2,1,0,2,2,1,2,2,1,2,1,0,2,1,0,2,2,1,2,2,1,2,1,0};
 	gunichar* res;
 	glong items_read, items_written, current_read_index,current_write_index;
@@ -481,6 +508,13 @@
 		current_write_index += items_written;
 	}
 
+	items_read = items_written = 0;
+	err = 0;
+	res = g_utf16_to_ucs4 (str5, 2, &items_read, &items_written, &err);
+	check_result = utf16_to_ucs4_check_result (res, exp5, items_read, 2, items_written, 1, err, FALSE);
+	if (check_result) return check_result;
+	g_free (res);
+
 	return OK;
 }
 RESULT
@@ -636,7 +670,102 @@
 	return OK;
 }
 
+glong
+utf8_byteslen (const gchar *src)
+{
+	int i = 0;
+	do {
+		if (src [i] == '\0')
+			return i;
+		i++;
+	} while (TRUE);
+}
+
+RESULT
+test_utf8_strcase_each (const gchar *src, const gchar *expected, gboolean strup)
+{
+	gchar *tmp;
+	glong len, len2;
+	RESULT r;
+
+	len = utf8_byteslen (src);
+	tmp = strup ? g_utf8_strup (src, len) : g_utf8_strdown (src, len);
+	len2 = utf8_byteslen (tmp);
+	r = compare_strings_utf8_RESULT (expected, tmp, len < len2 ? len2 : len);
+	g_free (tmp);
+	return r;
+}
+
+RESULT
+test_utf8_strup_each (const gchar *src, const gchar *expected)
+{
+	return test_utf8_strcase_each (src, expected, TRUE);
+}
+
+RESULT
+test_utf8_strdown_each (const gchar *src, const gchar *expected)
+{
+	return test_utf8_strcase_each (src, expected, FALSE);
+}
+
 /*
+ * g_utf8_strup
+ */
+RESULT
+test_utf8_strup ()
+{
+	RESULT r;
+
+	if ((r = test_utf8_strup_each ("aBc", "ABC")) != OK)
+		return r;
+	if ((r = test_utf8_strup_each ("x86-64", "X86-64")) != OK)
+		return r;
+	// U+3B1 U+392 -> U+391 U+392
+	if ((r = test_utf8_strup_each ("\xCE\xB1\xCE\x92", "\xCE\x91\xCE\x92")) != OK)
+		return r;
+	// U+FF21 -> U+FF21
+	if ((r = test_utf8_strup_each ("\xEF\xBC\xA1", "\xEF\xBC\xA1")) != OK)
+		return r;
+	// U+FF41 -> U+FF21
+	if ((r = test_utf8_strup_each ("\xEF\xBD\x81", "\xEF\xBC\xA1")) != OK)
+		return r;
+	// U+10428 -> U+10400
+	if ((r = test_utf8_strup_each ("\xF0\x90\x90\xA8", "\xF0\x90\x90\x80")) != OK)
+		return r;
+
+	return OK;
+}
+
+/*
+ * g_utf8_strdown
+ */
+RESULT
+test_utf8_strdown ()
+{
+	RESULT r;
+
+	if ((r = test_utf8_strdown_each ("aBc", "abc")) != OK)
+		return r;
+	if ((r = test_utf8_strdown_each ("X86-64", "x86-64")) != OK)
+		return r;
+	// U+391 U+3B2 -> U+3B1 U+3B2
+	if ((r = test_utf8_strdown_each ("\xCE\x91\xCE\xB2", "\xCE\xB1\xCE\xB2")) != OK)
+		return r;
+/*
+	// U+FF41 -> U+FF41
+	if ((r = test_utf8_strdown_each ("\xEF\xBD\x81", "\xEF\xBD\x81")) != OK)
+		return r;
+	// U+FF21 -> U+FF41
+	if ((r = test_utf8_strdown_each ("\xEF\xBC\xA1", "\xEF\xBD\x81")) != OK)
+		return r;
+	// U+10400 -> U+10428
+	if ((r = test_utf8_strdown_each ("\xF0\x90\x90\x80", "\xF0\x90\x90\xA8")) != OK)
+		return r;
+*/
+	return OK;
+}
+
+/*
  * test initialization
  */
 
@@ -652,6 +781,8 @@
 	{"g_utf8_get_char", test_utf8_get_char },
 	{"g_utf8_next_char", test_utf8_next_char },
 	{"g_utf8_validate", test_utf8_validate },
+	{"g_utf8_strup", test_utf8_strup},
+	{"g_utf8_strdown", test_utf8_strdown},
 	{NULL, NULL}
 };
 
Index: test/Makefile.am
===================================================================
--- test/Makefile.am	(revision 117383)
+++ test/Makefile.am	(working copy)
@@ -21,6 +21,7 @@
 	pattern.c	\
 	dir.c		\
 	markup.c	\
+	unicode.c	\
 	utf8.c		\
 	endian.c	\
 	module.c	\
Index: TODO
===================================================================
--- TODO	(revision 117383)
+++ TODO	(working copy)
@@ -14,8 +14,7 @@
 
 * Unimplemented, not supported currently:
 
-		g_unichar_tolower	Used for deprecated unmanaged string collation
-		g_unichar_type		Used for deprecated unmanaged string collation
+	(none as yet.)
 
 * Dead Code
 
Index: src/gutf8.c
===================================================================
--- src/gutf8.c	(revision 117383)
+++ src/gutf8.c	(working copy)
@@ -21,6 +21,40 @@
 	return error_quark;
 }
 
+gchar*
+utf8_case_conv (const gchar *str, gssize len, gboolean upper)
+{
+	glong i, u16len, u32len;
+	gunichar2 *u16str;
+	gunichar *u32str;
+	gchar *u8str;
+	GError **err = NULL;
+	/* UTF-8 -> UTF-16 -> UCS-4, map each code point, then convert back. */
+	u16str = g_utf8_to_utf16 (str, len, NULL, &u16len, err);
+	u32str = g_utf16_to_ucs4 (u16str, u16len, NULL, &u32len, err);
+	for (i = 0; i < u32len; i++) {
+		u32str [i] = upper ? g_unichar_toupper (u32str [i]) : g_unichar_tolower (u32str [i]);
+	}
+	g_free (u16str);
+	u16str = g_ucs4_to_utf16 (u32str, u32len, NULL, &u16len, err);
+	u8str = g_utf16_to_utf8 (u16str, u16len, NULL, NULL, err);
+	g_free (u32str);
+	g_free (u16str);
+	return u8str;
+}
+
+gchar*
+g_utf8_strup (const gchar *str, gssize len)
+{
+	return utf8_case_conv (str, len, TRUE);
+}
+
+gchar*
+g_utf8_strdown (const gchar *str, gssize len)
+{
+	return utf8_case_conv (str, len, FALSE);
+}
+
 gunichar2*
 g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error)
 {
@@ -268,12 +302,14 @@
 	while (len < 0 ? str [in_pos] : in_pos < len) {
 		ch = str [in_pos];
 		if (surrogate) {
-			surrogate = 0;
-			if (ch >= 0xDC00 && ch <= 0xDFFF)
+			if (ch >= 0xDC00 && ch <= 0xDFFF) {
 				codepoint = 0x10000 + (ch - 0xDC00) + ((surrogate - 0xD800) << 10);
-			else
+				surrogate = 0;
+			} else {
+				surrogate = 0;
 				/* invalid surrogate pair */
 				continue;
+			}
 		} else {
 			/* fast path optimization */
 			if (ch < 0x80) {
@@ -296,6 +332,8 @@
 		}
 		in_pos++;
 
+		if (surrogate != 0)
+			continue;
 		if (codepoint < 0x80)
 			ret [out_pos++] = (gchar) codepoint;
 		else if (codepoint < 0x0800) {
Index: src/gunicode.c
===================================================================
--- src/gunicode.c	(revision 117383)
+++ src/gunicode.c	(working copy)
@@ -35,6 +35,7 @@
  */
 #include <stdio.h>
 #include <glib.h>
+#include <unicode-data.h>
 #include <errno.h>
 #ifdef _MSC_VER
 /* FIXME */
@@ -82,17 +83,96 @@
 GUnicodeType 
 g_unichar_type (gunichar c)
 {
-	g_error ("%s", "g_unichar_type is not implemented");
+	/* Returns the general category of c, or G_UNICODE_UNASSIGNED if unknown. */
+	int i;
+	guint32 cp = (guint32) c;
+
+	for (i = 0; i < unicode_category_ranges_count; i++) {
+		if (cp < unicode_category_ranges [i].start)
+			continue;
+		if (unicode_category_ranges [i].end <= cp)
+			continue;
+		return unicode_category [i] [cp - unicode_category_ranges [i].start];
+	}
+
+	/*
+	 * Large blocks that are matched here instead of through the tables.
+	 * Both ends of every range are inclusive:
+	 *   3400-4DB5, 4E00-9FC3, AC00-D7A3, 20000-2A6D6: OtherLetter
+	 *   D800-DFFF: Surrogate
+	 *   E000-F8FF, F0000-FFFFD, 100000-10FFFD: PrivateUse
+	 */
+	if ((0x3400 <= cp && cp <= 0x4DB5) || (0x4E00 <= cp && cp <= 0x9FC3))
+		return G_UNICODE_OTHER_LETTER;
+	if ((0xAC00 <= cp && cp <= 0xD7A3) || (0x20000 <= cp && cp <= 0x2A6D6))
+		return G_UNICODE_OTHER_LETTER;
+	if (0xD800 <= cp && cp <= 0xDFFF)
+		return G_UNICODE_SURROGATE;
+	if (0xE000 <= cp && cp <= 0xF8FF)
+		return G_UNICODE_PRIVATE_USE;
+	if ((0xF0000 <= cp && cp <= 0xFFFFD) || (0x100000 <= cp && cp <= 0x10FFFD))
+		return G_UNICODE_PRIVATE_USE;
+
+	/*
+	 * No table or block matched.  Return the explicit "unassigned" value:
+	 * a literal 0 would mean G_UNICODE_CONTROL, the first enumerator.
+	 */
-	return 0;
+	return G_UNICODE_UNASSIGNED;
 }
 
 gunichar
+g_unichar_case (gunichar c, gboolean upper)
+{
+	/* Simple case mapping of c from the generated tables; c itself if unmapped. */
+	int i;
+	guint32 cp = (guint32) c, v;
+	for (i = 0; i < simple_case_map_ranges_count; i++) {
+		if (cp < simple_case_map_ranges [i].start)
+			return c;
+		if (simple_case_map_ranges [i].end <= cp)
+			continue;
+		if (c < 0x10000) {
+			guint16 *tab = upper ? simple_upper_case_mapping_lowarea [i] : simple_lower_case_mapping_lowarea [i];
+			v = tab [cp - simple_case_map_ranges [i].start];
+		} else {
+			int i2 = i - (upper ? simple_upper_case_mapping_lowarea_table_count : simple_lower_case_mapping_lowarea_table_count);
+			guint32 *tab = upper ? simple_upper_case_mapping_higharea [i2] : simple_lower_case_mapping_higharea [i2];
+			v = tab [cp - simple_case_map_ranges [i].start];
+		}
+		return v != 0 ? (gunichar) v : c;
+	}
+	return c;
+}
+
+gunichar
+g_unichar_toupper (gunichar c)
+{
+	return g_unichar_case (c, TRUE);	/* TRUE selects the upper-case mapping tables */
+}
+
+gunichar
 g_unichar_tolower (gunichar c)
 {
-	g_error ("%s", "g_unichar_type is not implemented");
-	return 0;
+	return g_unichar_case (c, FALSE);	/* FALSE selects the lower-case mapping tables */
 }
 
+gunichar
+g_unichar_totitle (gunichar c)
+{
+	/* Title-case form of c: its explicit table entry if any, else its upper case. */
+	int i;
+	guint32 cp = (guint32) c;
+
+	for (i = 0; i < simple_titlecase_mapping_count; i++) {
+		if (simple_titlecase_mapping [i].codepoint == cp)
+			return simple_titlecase_mapping [i].title;
+		if (simple_titlecase_mapping [i].codepoint > cp)
+			/* it is ordered, hence no more match */
+			break;
+	}
+	return g_unichar_toupper (c);
+}
+
 gboolean
 g_unichar_isxdigit (gunichar c)
 {
Index: src/glib.h
===================================================================
--- src/glib.h	(revision 117383)
+++ src/glib.h	(working copy)
@@ -531,10 +531,41 @@
 typedef guint32 gunichar;
 
 typedef enum {
+	G_UNICODE_CONTROL,	/* Cc */
+	G_UNICODE_FORMAT,	/* Cf */
+	G_UNICODE_UNASSIGNED,	/* Cn */
+	G_UNICODE_PRIVATE_USE,	/* Co */
+	G_UNICODE_SURROGATE,	/* Cs */
 	G_UNICODE_LOWERCASE_LETTER,
+	G_UNICODE_MODIFIER_LETTER,	/* Lm */
+	G_UNICODE_OTHER_LETTER,	/* Lo */
+	G_UNICODE_TITLECASE_LETTER,	/* Lt */
+	G_UNICODE_UPPERCASE_LETTER,	/* Lu */
+	G_UNICODE_COMBINING_MARK,	/* Mc */
+	G_UNICODE_ENCLOSING_MARK,	/* Me */
+	G_UNICODE_NON_SPACING_MARK,	/* Mn */
+	G_UNICODE_DECIMAL_NUMBER,	/* Nd */
+	G_UNICODE_LETTER_NUMBER,	/* Nl */
+	G_UNICODE_OTHER_NUMBER,	/* No */
+	G_UNICODE_CONNECT_PUNCTUATION,	/* Pc */
+	G_UNICODE_DASH_PUNCTUATION,	/* Pd */
+	G_UNICODE_CLOSE_PUNCTUATION,	/* Pe */
+	G_UNICODE_FINAL_PUNCTUATION,	/* Pf */
+	G_UNICODE_INITIAL_PUNCTUATION,	/* Pi */
+	G_UNICODE_OTHER_PUNCTUATION,	/* Po */
+	G_UNICODE_OPEN_PUNCTUATION,	/* Ps */
+	G_UNICODE_CURRENCY_SYMBOL,	/* Sc */
+	G_UNICODE_MODIFIER_SYMBOL,	/* Sk */
+	G_UNICODE_MATH_SYMBOL,	/* Sm */
+	G_UNICODE_OTHER_SYMBOL,	/* So */
+	G_UNICODE_LINE_SEPARATOR,	/* Zl */
+	G_UNICODE_PARAGRAPH_SEPARATOR,	/* Zp */
+	G_UNICODE_SPACE_SEPARATOR	/* Zs */
 } GUnicodeType;
 
+gunichar       g_unichar_toupper (gunichar c);
 gunichar       g_unichar_tolower (gunichar c);
+gunichar       g_unichar_totitle (gunichar c);
 GUnicodeType   g_unichar_type    (gunichar c);
 gboolean       g_unichar_isxdigit (gunichar c);
 gint           g_unichar_xdigit_value (gunichar c);
@@ -570,6 +601,8 @@
 	G_CONVERT_ERROR_NOT_ABSOLUTE_PATH
 } GConvertError;
 
+gchar* g_utf8_strup (const gchar *str, gssize len);
+gchar* g_utf8_strdown (const gchar *str, gssize len);
 gunichar2 *g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error);
 gchar     *g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error);
 gunichar2 *g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error);

unicode-data.tar.bz2
Description: application/bzip

_______________________________________________
Mono-devel-list mailing list
Mono-devel-list@lists.ximian.com
http://lists.ximian.com/mailman/listinfo/mono-devel-list

[Mono-dev] eglib patch to implement some unicode stuff

Reply via email to