From d6b648aba391ebdea610c8cc746fbfe07e270b91 Mon Sep 17 00:00:00 2001
From: kerams <kerams@users.noreply.github.com>
Date: Fri, 22 Aug 2025 20:00:00 +0000
Subject: [PATCH] Add Hebrew and Arabic combining characters to unaccent.rules

---
 contrib/unaccent/generate_unaccent_rules.py |  18 +++-
 contrib/unaccent/unaccent.rules             | 103 ++++++++++++++++++++
 2 files changed, 118 insertions(+), 3 deletions(-)

diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index 40822d0c176..8db0edb4b96 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -49,10 +49,22 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')),  # Latin lower case
 # combining (Mc). We identify the ranges of marks we feel safe removing.
 # References:
 #   https://en.wikipedia.org/wiki/Combining_character
-#   https://www.unicode.org/charts/PDF/U0300.pdf
-#   https://www.unicode.org/charts/PDF/U20D0.pdf
+#   https://www.unicode.org/charts/PDF/U0300.pdf   (Combining Diacritical Marks)
+#   https://www.unicode.org/charts/PDF/U20D0.pdf   (Combining Diacritical Marks for Symbols)
+#   https://www.unicode.org/charts/PDF/U0590.pdf   (Hebrew block)
+#   https://www.unicode.org/charts/PDF/U0600.pdf   (Arabic block)
 COMBINING_MARK_RANGES = ((0x0300, 0x0362),   # Mn: Accents, IPA
-                         (0x20dd, 0x20E0),   # Me: Symbols
+                         (0x0591, 0x05bd),   # Mn: Hebrew points, accents
+                         (0x05bf, 0x05c2),   # Mn: Hebrew rafe, shin/sin dots (U+05C0 is Po)
+                         (0x05c4, 0x05c5),   # Mn: Hebrew marks
+                         (0x05c7, 0x05c7),   # Mn: Hebrew qamats qatan
+                         (0x0610, 0x061a),   # Mn: Arabic signs
+                         (0x064b, 0x065f),   # Mn: Arabic vowel marks
+                         (0x0670, 0x0670),   # Mn: Arabic superscript alef
+                         (0x06d6, 0x06e4),   # Mn: Arabic small signs (U+06DD is Cf, U+06DE is So)
+                         (0x06e7, 0x06e8),   # Mn: Arabic small signs
+                         (0x06ea, 0x06ed),   # Mn: Arabic small signs
+                         (0x20dd, 0x20e0),   # Me: Symbols
                          (0x20e2, 0x20e4),)  # Me: Screen, keycap, triangle
 
 
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
index 35fd246b71f..7f922f4078d 100644
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -560,6 +560,109 @@
 Ϲ	Σ
 Ё	Е
 ё	е
+֑
+֒
+֓
+֔
+֕
+֖
+֗
+֘
+֙
+֚
+֛
+֜
+֝
+֞
+֟
+֠
+֡
+֢
+֣
+֤
+֥
+֦
+֧
+֨
+֩
+֪
+֫
+֬
+֭
+֮
+֯
+ְ
+ֱ
+ֲ
+ֳ
+ִ
+ֵ
+ֶ
+ַ
+ָ
+ֹ
+ֺ
+ֻ
+ּ
+ֽ
+ֿ
+ׁ
+ׂ
+ׄ
+ׅ
+ׇ
+ؐ
+ؑ
+ؒ
+ؓ
+ؔ
+ؕ
+ؖ
+ؗ
+ؘ
+ؙ
+ؚ
+ً
+ٌ
+ٍ
+َ
+ُ
+ِ
+ّ
+ْ
+ٓ
+ٔ
+ٕ
+ٖ
+ٗ
+٘
+ٙ
+ٚ
+ٛ
+ٜ
+ٝ
+ٞ
+ٟ
+ٰ
+ۖ
+ۗ
+ۘ
+ۙ
+ۚ
+ۛ
+ۜ
+۟
+۠
+ۡ
+ۢ
+ۣ
+ۤ
+ۧ
+ۨ
+۪
+۫
+۬
+ۭ
 ᴀ	A
 ᴁ	AE
 ᴃ	B
-- 
2.49.0.windows.1

