On Wed, Jun 7, 2017 at 1:06 AM, Man Trieu <man.tr...@gmail.com> wrote:
> 2017-06-07 0:31 GMT+09:00 Bruce Momjian <br...@momjian.us>:
>>
>> On Wed, Jun  7, 2017 at 12:10:25AM +0900, Dang Minh Huong wrote:
>> > > On Jun 4, 29 Heisei, at 00:48, Bruce Momjian <br...@momjian.us> wrote:
>> > >>>> Shouldn't you use "or is_letter_with_marks()", instead of "or
>> > >>>> len(...)
>> > >>>>> 1"?  Your test might catch something that isn't based on a
>> > >>>>> 'letter'
>> > >>>> (according to is_plain_letter).  Otherwise this looks pretty good
>> > >>>> to
>> > >>>> me.  Please add it to the next commitfest.
>> > >>>
>> > >>> Thanks for confirm, sir.
>> > >>> I will add it to the next CF soon.
>> > >>
>> > >> Sorry for lately response. I attach the update patch.
>> > >
>> > > Uh, there is no patch attached.
>> > >
>> >
>> > Sorry sir, reattach the patch.
>> > I also added it to the next CF and set reviewers to Thomas Munro. Could
>> > you confirm for me.
>>
>> There seems to be a problem.  I can't see a patch dated 2017-06-07 on
>> the commitfest page:
>>
>>         https://commitfest.postgresql.org/14/1161/
>>
>> I added the thread but there was no change.  (I think the thread was
>> already present.)  It appears it is not seeing this patch as the latest
>> patch.
>>
>> Does anyone know why this is happening?
>
> May be due to my Mac's mailer? Sorry but I try one more time to attach the
> patch by webmail.

I have finally been able to look at this patch.
(Surprised to see that generate_unaccent_rules.py is inconsistent on
MacOS, runs fine on Linux).

 def get_plain_letter(codepoint, table):
     """Return the base codepoint without marks."""
     if is_letter_with_marks(codepoint, table):
-        return table[codepoint.combining_ids[0]]
+        if len(table[codepoint.combining_ids[0]].combining_ids) > 1:
+            # Recursive to find the plain letter
+            return get_plain_letter(table[codepoint.combining_ids[0]],table)
+        elif is_plain_letter(table[codepoint.combining_ids[0]]):
+            return table[codepoint.combining_ids[0]]
+        else:
+            return None
     elif is_plain_letter(codepoint):
         return codepoint
     else:
-        raise "mu"
+        return None
The code paths returning None should not be reached, so I would
suggest adding an assertion instead. Callers of get_plain_letter would
blow up on None, still that would make future debugging harder.

 def is_letter_with_marks(codepoint, table):
-    """Returns true for plain letters combined with one or more marks."""
+    """Returns true for letters combined with one or more marks."""
     # See 
http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
     return len(codepoint.combining_ids) > 1 and \
-           is_plain_letter(table[codepoint.combining_ids[0]]) and \
+           (is_plain_letter(table[codepoint.combining_ids[0]]) or\
+            is_letter_with_marks(table[codepoint.combining_ids[0]],table))
and \
            all(is_mark(table[i]) for i in codepoint.combining_ids[1:]
This was already hard to follow, and this patch makes its harder. I
think that the thing should be refactored with multiple conditions.

             if is_letter_with_marks(codepoint, table):
-                charactersSet.add((codepoint.id,
+                if get_plain_letter(codepoint, table) <> None:
+                    charactersSet.add((codepoint.id,
This change is not necessary as a letter with marks is not a plain
character anyway.

Testing with characters having two accents, the results are produced
as wanted. I am attaching an updated patch with all those
simplifications. Thoughts?
-- 
Michael
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index a5eb42f0b1..9135ec23ce 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -48,24 +48,47 @@ def is_mark(codepoint):
     return codepoint.general_category in ("Mn", "Me", "Mc")
 
 def is_letter_with_marks(codepoint, table):
-    """Returns true for plain letters combined with one or more marks."""
+    """Returns true for letters combined with one or more marks."""
     # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
-    return len(codepoint.combining_ids) > 1 and \
-           is_plain_letter(table[codepoint.combining_ids[0]]) and \
-           all(is_mark(table[i]) for i in codepoint.combining_ids[1:])
+
+    # Letter may have no combining characters, in which case it has
+    # no marks.
+    if len(codepoint.combining_ids) == 1:
+        return False
+
+    # A letter without diatritical marks has none of them.
+    if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False:
+        return False
+
+    # Check if the base letter of this letter has marks.
+    codepoint_base = codepoint.combining_ids[0]
+    if (is_plain_letter(table[codepoint_base]) is False and \
+        is_letter_with_marks(table[codepoint_base], table) is False):
+        return False
+
+    return True
 
 def is_letter(codepoint, table):
     """Return true for letter with or without diacritical marks."""
     return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
 
 def get_plain_letter(codepoint, table):
-    """Return the base codepoint without marks."""
+    """Return the base codepoint without marks. If this codepoint has more
+    than one combining character, do a recursive lookup on the table to
+    find out its plain base letter."""
     if is_letter_with_marks(codepoint, table):
-        return table[codepoint.combining_ids[0]]
+        if len(table[codepoint.combining_ids[0]].combining_ids) > 1:
+            return get_plain_letter(table[codepoint.combining_ids[0]], table)
+        elif is_plain_letter(table[codepoint.combining_ids[0]]):
+            return table[codepoint.combining_ids[0]]
+
+        # Should not come here
+        assert(False)
     elif is_plain_letter(codepoint):
         return codepoint
-    else:
-        raise "mu"
+
+    # Should not come here
+    assert(False)
 
 def is_ligature(codepoint, table):
     """Return true for letters combined with letters."""
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
index 84886da587..97f9ed47cf 100644
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -254,6 +254,18 @@
 ǒ	o
 Ǔ	U
 ǔ	u
+Ǖ	U
+ǖ	u
+Ǘ	U
+ǘ	u
+Ǚ	U
+ǚ	u
+Ǜ	U
+ǜ	u
+Ǟ	A
+ǟ	a
+Ǡ	A
+ǡ	a
 Ǥ	G
 ǥ	g
 Ǧ	G
@@ -262,6 +274,8 @@
 ǩ	k
 Ǫ	O
 ǫ	o
+Ǭ	O
+ǭ	o
 ǰ	j
 DZ	DZ
 Dz	Dz
@@ -270,6 +284,8 @@
 ǵ	g
 Ǹ	N
 ǹ	n
+Ǻ	A
+ǻ	a
 Ȁ	A
 ȁ	a
 Ȃ	A
@@ -307,8 +323,14 @@
 ȧ	a
 Ȩ	E
 ȩ	e
+Ȫ	O
+ȫ	o
+Ȭ	O
+ȭ	o
 Ȯ	O
 ȯ	o
+Ȱ	O
+ȱ	o
 Ȳ	Y
 ȳ	y
 ȴ	l
@@ -441,6 +463,8 @@
 ḅ	b
 Ḇ	B
 ḇ	b
+Ḉ	C
+ḉ	c
 Ḋ	D
 ḋ	d
 Ḍ	D
@@ -451,10 +475,16 @@
 ḑ	d
 Ḓ	D
 ḓ	d
+Ḕ	E
+ḕ	e
+Ḗ	E
+ḗ	e
 Ḙ	E
 ḙ	e
 Ḛ	E
 ḛ	e
+Ḝ	E
+ḝ	e
 Ḟ	F
 ḟ	f
 Ḡ	G
@@ -471,6 +501,8 @@
 ḫ	h
 Ḭ	I
 ḭ	i
+Ḯ	I
+ḯ	i
 Ḱ	K
 ḱ	k
 Ḳ	K
@@ -479,6 +511,8 @@
 ḵ	k
 Ḷ	L
 ḷ	l
+Ḹ	L
+ḹ	l
 Ḻ	L
 ḻ	l
 Ḽ	L
@@ -497,6 +531,14 @@
 ṉ	n
 Ṋ	N
 ṋ	n
+Ṍ	O
+ṍ	o
+Ṏ	O
+ṏ	o
+Ṑ	O
+ṑ	o
+Ṓ	O
+ṓ	o
 Ṕ	P
 ṕ	p
 Ṗ	P
@@ -505,12 +547,20 @@
 ṙ	r
 Ṛ	R
 ṛ	r
+Ṝ	R
+ṝ	r
 Ṟ	R
 ṟ	r
 Ṡ	S
 ṡ	s
 Ṣ	S
 ṣ	s
+Ṥ	S
+ṥ	s
+Ṧ	S
+ṧ	s
+Ṩ	S
+ṩ	s
 Ṫ	T
 ṫ	t
 Ṭ	T
@@ -525,6 +575,10 @@
 ṵ	u
 Ṷ	U
 ṷ	u
+Ṹ	U
+ṹ	u
+Ṻ	U
+ṻ	u
 Ṽ	V
 ṽ	v
 Ṿ	V
@@ -563,12 +617,42 @@
 ạ	a
 Ả	A
 ả	a
+Ấ	A
+ấ	a
+Ầ	A
+ầ	a
+Ẩ	A
+ẩ	a
+Ẫ	A
+ẫ	a
+Ậ	A
+ậ	a
+Ắ	A
+ắ	a
+Ằ	A
+ằ	a
+Ẳ	A
+ẳ	a
+Ẵ	A
+ẵ	a
+Ặ	A
+ặ	a
 Ẹ	E
 ẹ	e
 Ẻ	E
 ẻ	e
 Ẽ	E
 ẽ	e
+Ế	E
+ế	e
+Ề	E
+ề	e
+Ể	E
+ể	e
+Ễ	E
+ễ	e
+Ệ	E
+ệ	e
 Ỉ	I
 ỉ	i
 Ị	I
@@ -577,10 +661,40 @@
 ọ	o
 Ỏ	O
 ỏ	o
+Ố	O
+ố	o
+Ồ	O
+ồ	o
+Ổ	O
+ổ	o
+Ỗ	O
+ỗ	o
+Ộ	O
+ộ	o
+Ớ	O
+ớ	o
+Ờ	O
+ờ	o
+Ở	O
+ở	o
+Ỡ	O
+ỡ	o
+Ợ	O
+ợ	o
 Ụ	U
 ụ	u
 Ủ	U
 ủ	u
+Ứ	U
+ứ	u
+Ừ	U
+ừ	u
+Ử	U
+ử	u
+Ữ	U
+ữ	u
+Ự	U
+ự	u
 Ỳ	Y
 ỳ	y
 Ỵ	Y
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to