source

Jonathan Clark (via logerrit) Thu, 09 May 2024 12:24:18 -0700

 i18npool/CustomTarget_breakiterator.mk                       |    6 
 i18npool/qa/cppunit/test_breakiterator.cxx                   |  354 +++--
 i18npool/source/breakiterator/data/dict_word.txt             |  267 ++--
 i18npool/source/breakiterator/data/dict_word_he.txt          |  139 --
 i18npool/source/breakiterator/data/dict_word_hu.txt          |  324 ++---
 i18npool/source/breakiterator/data/dict_word_nodash.txt      |  147 --
 i18npool/source/breakiterator/data/dict_word_prepostdash.txt |  288 ++--
 i18npool/source/breakiterator/data/edit_word.txt             |  261 ++--
 i18npool/source/breakiterator/data/edit_word_he.txt          |  142 --
 i18npool/source/breakiterator/data/edit_word_hu.txt          |  294 ++--
 i18npool/source/breakiterator/data/line.txt                  |  680 +++--------
 i18npool/source/breakiterator/data/sent.txt                  |  128 --
 12 files changed, 1306 insertions(+), 1724 deletions(-)


New commits:
commit 44699b3de37f07090ac6fee1cd97aa76036e9700
Author:     Jonathan Clark <[email protected]>
AuthorDate: Wed Apr 17 09:09:50 2024 -0600
Commit:     Caolán McNamara <[email protected]>
CommitDate: Thu May 9 21:23:50 2024 +0200

    tdf#49885 BreakIterator rule upgrades
    
    This change re-bases the BreakIterator rule customizations on top of a
    clean copy of the ICU 74.2 rules.
    
    Change-Id: Iadcf16cab138cc6c869fac61ad64e996e65b5ae4
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/166273
    Tested-by: Jenkins
    Tested-by: Caolán McNamara <[email protected]>
    Reviewed-by: Caolán McNamara <[email protected]>

diff --git a/i18npool/CustomTarget_breakiterator.mk b/i18npool/CustomTarget_breakiterator.mk
index aaba3c1503de..095672878f3d 100644
--- a/i18npool/CustomTarget_breakiterator.mk
+++ b/i18npool/CustomTarget_breakiterator.mk
@@ -16,16 +16,12 @@ $(call gb_CustomTarget_get_target,i18npool/breakiterator) : \
 
 i18npool_BRKTXTS := \
     count_word.brk \
-    $(call gb_Helper_optional_locale,he,dict_word_he.brk) \
     $(call gb_Helper_optional_locale,hu,dict_word_hu.brk) \
-    dict_word_nodash.brk \
     dict_word_prepostdash.brk \
     dict_word.brk \
-    $(call gb_Helper_optional_locale,he,edit_word_he.brk) \
     $(call gb_Helper_optional_locale,hu,edit_word_hu.brk) \
     edit_word.brk \
-    line.brk \
-    sent.brk
+    line.brk
 
 # 'gencmn', 'genbrk' and 'genccode' are tools generated and delivered by icu 
project to process icu breakiterator rules.
 # The output of gencmn generates warnings under Windows. We want to minimize 
the patches to external tools,
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
index 2ac46f9bdca6..0103637989e4 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -184,11 +184,10 @@ void TestBreakIterator::testLineBreaking()
 
         {
             // Per the bug, the line break should leave -bar clumped together 
on the next line.
-            // However, this change was reverted at some point. This test 
asserts the new behavior.
             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                 u"foo -bar"_ustr, strlen("foo -ba"), aLocale, 0, aHyphOptions, 
aUserOptions);
             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash",
-                                         static_cast<sal_Int32>(5), 
aResult.breakIndex);
+                                         static_cast<sal_Int32>(4), 
aResult.breakIndex);
         }
     }
 
@@ -198,11 +197,29 @@ void TestBreakIterator::testLineBreaking()
         aLocale.Country = "US";
 
         {
-            // Here we want the line break to leave C:\Program Files\ on the 
first line
+            // Note that the current behavior deviates from the original fix 
for this bug.
+            //
+            // The original report was filed due to wrapping all of "\Program 
Filesaaa" to the
+            // next line, even though only "aaaa" overflowed. The original fix 
was to simply make
+            // U+005C reverse solidus (backslash) a breaking character.
+            //
+            // However, the root cause for this bug was not the behavior of 
'\', but rather some
+            // other bug making all of "\Program Files\" behave like a single 
token, despite it
+            // even containing whitespace.
+            //
+            // Reverting to the ICU line rules fixes this root issue. Now, in 
the following,
+            // "C:\Program" and "Files\LibreOffice" are treated as separate 
tokens. This is also
+            // consistent with the behavior of other office programs.
             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                 u"C:\\Program Files\\LibreOffice"_ustr, strlen("C:\\Program Files\\Libre"), aLocale, 0,
                 aHyphOptions, aUserOptions);
-            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), 
aResult.breakIndex);
+            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), 
aResult.breakIndex);
+
+            // An identical result should be generated for solidus.
+            aResult = m_xBreak->getLineBreak(
+                "C:/Program Files/LibreOffice", strlen("C:/Program 
Files/Libre"), aLocale, 0,
+                aHyphOptions, aUserOptions);
+            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), 
aResult.breakIndex);
         }
     }
 
@@ -251,23 +268,125 @@ void TestBreakIterator::testLineBreaking()
         aLocale.Country = "US";
 
         {
+            // The root cause for this bug was the Unicode standard 
introducing special treatment
+            // for '-' in a number range context. This change makes number 
ranges (e.g. "100-199")
+            // behave as if they are single tokens for the purposes of line 
breaking. Unfortunately,
+            // this caused a significant appearance change to existing 
documents.
+            //
+            // Despite being a user-visible layout change, this isn't exactly 
a bug. Wrapping
+            // number ranges as a single token is consistent with other 
applications, including web
+            // browsers, and other office suites as mentioned in the bug 
discussion. Removing this
+            // customization seems like it would be a major change, however.
+            //
             // Here we want the line break to leave 100- clumped on the first 
line.
+
             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                 u"word 100-199 word"_ustr, strlen("word 100-1"), aLocale, 0, 
aHyphOptions, aUserOptions);
             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(9), 
aResult.breakIndex);
         }
-    }
 
-    // i#83649: Line break should be between typographical quote and left 
bracket
-    {
+        {
+            // From the same bug: "the leading minus must stay with numbers 
and strings"
+
+            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
+                    "range of -100.000 to 100.000", strlen("range of -1"), 
aLocale, 0,
+                    aHyphOptions, aUserOptions);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex);
+
+            constexpr OUString str = u"range of \u2212100.000 to 100.000"_ustr;
+            aResult = m_xBreak->getLineBreak(
+                    str, strlen("range of -"), aLocale, 0, aHyphOptions, 
aUserOptions);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex);
+        }
+
         aLocale.Language = "de";
         aLocale.Country = "DE";
 
         {
-            // Here we want the line break to leave »angetan werden« on the 
first line
+            // From the same bug: "the leading minus must stay with numbers 
and strings"
+
+            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
+                    "EURO is -10,50", strlen("EURO is -1"), aLocale, 0, 
aHyphOptions, aUserOptions);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex);
+
+            // Also the mathematical minus sign:
+
+            constexpr OUString str = u"EURO is \u221210,50"_ustr;
+            aResult = m_xBreak->getLineBreak(
+                    str, strlen("EURO is -"), aLocale, 0, aHyphOptions, 
aUserOptions);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex);
+        }
+
+        {
+            // From the same bug: "the leading minus must stay with numbers 
and strings"
+
+            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
+                    "und -kosten", strlen("und -ko"), aLocale, 0,
+                    aHyphOptions, aUserOptions);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32{4}, aResult.breakIndex);
+
+            // But not the non-breaking hyphen:
+
+            constexpr OUString str = u"und \u2011"_ustr;
+            aResult = m_xBreak->getLineBreak(
+                    str, strlen("und -ko"), aLocale, 0, aHyphOptions, 
aUserOptions);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32{5}, aResult.breakIndex);
+        }
+    }
+
+    // i#83649: "Line break should be between typographical quote and left 
bracket"
+    // - Actually: Spaces between quotation mark and opening punctuation not 
treated as a break.
+    // - Note that per the Unicode standard, prohibiting breaks in this 
context is intentional
+    // because it may cause issues in certain languages due to the various 
ways quotation
+    // characters are used.
+    // - We do it anyway by customizing the ICU line breaking rules.
+    {
+        {
+            // This uses the sample text provided in the bug report. Based on 
usage, it is assumed
+            // they were in the de_DE locale.
+
+            aLocale.Language = "de";
+            aLocale.Country = "DE";
+
+            // Per the bug report, it is expected that »angetan werden« 
remains on the first line.
             const OUString str = u"»angetan werden« [Passiv]"_ustr;
             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
-                str, strlen("Xangetan werdenX ["), aLocale, 0, aHyphOptions, 
aUserOptions);
+                str, str.getLength() - 4, aLocale, 0, aHyphOptions, 
aUserOptions);
+            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), 
aResult.breakIndex);
+
+            // The same result should be returned for this and the first case.
+            const OUString str2 = u"»angetan werden« Passiv"_ustr;
+            aResult = m_xBreak->getLineBreak(
+                str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, 
aUserOptions);
+            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), 
aResult.breakIndex);
+
+            // Under ICU rules, no amount of spaces would cause this to wrap.
+            const OUString str3 = u"»angetan werden«    [Passiv]"_ustr;
+            aResult = m_xBreak->getLineBreak(
+                str3, str3.getLength() - 4, aLocale, 0, aHyphOptions, 
aUserOptions);
+            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(20), 
aResult.breakIndex);
+
+            // However, tabs will
+            const OUString str4 = u"»angetan werden«\t[Passiv]"_ustr;
+            aResult = m_xBreak->getLineBreak(
+                str4, str4.getLength() - 4, aLocale, 0, aHyphOptions, 
aUserOptions);
+            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), 
aResult.breakIndex);
+        }
+
+        {
+            // The same behavior is seen in English
+
+            aLocale.Language = "en";
+            aLocale.Country = "US";
+
+            const OUString str = u"\"angetan werden\" [Passiv]"_ustr;
+            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
+                str, str.getLength() - 4, aLocale, 0, aHyphOptions, 
aUserOptions);
+            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), 
aResult.breakIndex);
+
+            const OUString str2 = u"\"angetan werden\" Passiv"_ustr;
+            aResult = m_xBreak->getLineBreak(
+                str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, 
aUserOptions);
             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), 
aResult.breakIndex);
         }
     }
@@ -355,7 +474,7 @@ void TestBreakIterator::testLineBreaking()
             auto res = m_xBreak->getLineBreak(u"Wort -prinzessinnen, 
wort"_ustr,
                                               strlen("Wort -prinzessinnen,"), 
aLocale, 0,
                                               aHyphOptions, aUserOptions);
-            CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, res.breakIndex);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32{ 5 }, res.breakIndex);
         }
     }
 }
@@ -638,7 +757,8 @@ void TestBreakIterator::testWordBoundaries()
         CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
     }
 
-    //See https://bz.apache.org/ooo/show_bug.cgi?id=85411
+    // i#85411: ZWSP should be a word separator for spellchecking
+    // - This fix was applied to both dict and edit customizations
     for (int j = 0; j < 3; ++j)
     {
         switch (j)
@@ -660,21 +780,23 @@ void TestBreakIterator::testWordBoundaries()
                 break;
         }
 
-        static constexpr OUString aTest =
-            u"I\u200Bwant\u200Bto\u200Bgo"_ustr;
+        static constexpr OUString aTest = u"I\u200Bwant\u200Bto\u200Bgo"_ustr;
 
         sal_Int32 nPos = 0;
-        sal_Int32 aExpected[] = {1, 6, 9, 12};
+        sal_Int32 aExpected[] = { 1, 6, 9, 12 };
         size_t i = 0;
         do
         {
             CPPUNIT_ASSERT(i < std::size(aExpected));
-            nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
-                i18n::WordType::DICTIONARY_WORD, true).endPos;
-            CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
+            auto dwPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
+                                                   
i18n::WordType::DICTIONARY_WORD, true);
+            CPPUNIT_ASSERT_EQUAL(aExpected[i], dwPos.endPos);
+            auto ewPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
+                                                   
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+            CPPUNIT_ASSERT_EQUAL(aExpected[i], ewPos.endPos);
+            nPos = dwPos.endPos;
             ++i;
-        }
-        while (nPos++ < aTest.getLength());
+        } while (nPos++ < aTest.getLength());
         CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
     }
 
@@ -814,121 +936,45 @@ void TestBreakIterator::testWordBoundaries()
     }
 
     // i#56347: "BreakIterator patch for Hungarian"
-    // Rules for Hungarian affixes after numbers and certain symbols
-    {
-        auto mode = i18n::WordType::DICTIONARY_WORD;
-        aLocale.Language = "hu";
-        aLocale.Country = "HU";
-
-        OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr;
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
-    }
-
     // i#56348: Special chars in first pos not handled by spell checking in 
Writer (Hungarian)
-    // Rules for Hungarian affixes after numbers and certain symbols in edit 
mode.
-    // The patch was merged, but the original bug was never closed and the 
current behavior seems
-    // identical to the ICU default behavior. Added this test to ensure that 
doesn't change.
+    // Rules for Hungarian affixes after numbers and certain symbols
     {
-        auto mode = i18n::WordType::ANY_WORD;
         aLocale.Language = "hu";
         aLocale.Country = "HU";
 
         OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr;
 
-        aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 17, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 19, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 20, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.endPos);
-
-        aBounds = m_xBreak->getWordBoundary(aTest, 21, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
+        for (auto mode :
+             { i18n::WordType::DICTIONARY_WORD, 
i18n::WordType::ANYWORD_IGNOREWHITESPACES })
+        {
+            aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
 
-        aBounds = m_xBreak->getWordBoundary(aTest, 24, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.endPos);
+            aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
 
-        aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.endPos);
+            aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, 
true);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
 
-        aBounds = m_xBreak->getWordBoundary(aTest, 26, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.endPos);
+            aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, 
true);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
 
-        aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
+            aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, 
true);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
 
-        aBounds = m_xBreak->getWordBoundary(aTest, 30, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.endPos);
+            aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, 
true);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
 
-        aBounds = m_xBreak->getWordBoundary(aTest, 31, aLocale, mode, true);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
-        CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
+            aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, 
true);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
+            CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
+        }
     }
 
     // tdf#49885: Upgrade CJ word boundary analysis to ICU frequency-based 
analysis
@@ -983,6 +1029,56 @@ void TestBreakIterator::testSentenceBoundaries()
         CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak->beginOfSentence(aTest, 
26, aLocale));
         CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak->endOfSentence(aTest, 26, 
aLocale));
     }
+
+    // i#55063: Sentence selection in Thai should select a space-delimited 
phrase.
+    // - This customization broke at some point. It works in an English locale 
in a synthetic test
+    // like this one, but does not work in the Thai locale, nor on Thai text 
in practice.
+    {
+        static constexpr OUString aTest = u"ว้อย โหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอก โปรโมเตอร์"_ustr;
+
+        aLocale.Language = "en";
+        aLocale.Country = "US";
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 
23, aLocale));
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, 
aLocale));
+
+        aLocale.Language = "th";
+        aLocale.Country = "TH";
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 
23, aLocale));
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, 
aLocale));
+    }
+
+    // i#55063: Thai phrases should delimit English sentence selection.
+    // - This customization broke at some point. It works in an English locale 
in a synthetic test
+    // like this one, but does not work in the Thai locale, nor on Thai text 
in practice.
+    {
+        static constexpr OUString aTest = u"ว้อย English usually ends with a period โปรโมเตอร์."_ustr;
+
+        aLocale.Language = "en";
+        aLocale.Country = "US";
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 
23, aLocale));
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, 
aLocale));
+
+        aLocale.Language = "th";
+        aLocale.Country = "TH";
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 
23, aLocale));
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, 
aLocale));
+    }
+
+    // i#55063: Characteristic test for English text delimiting Thai phrases 
(sentences)
+    // - English text should not delimit Thai phrases.
+    {
+        static constexpr OUString aTest = 
u"Englishโหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอกEnglish"_ustr;
+
+        aLocale.Language = "en";
+        aLocale.Country = "US";
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 
23, aLocale));
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, 
aLocale));
+
+        aLocale.Language = "th";
+        aLocale.Country = "TH";
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 
23, aLocale));
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, 
aLocale));
+    }
 }
 
 //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
@@ -1559,6 +1655,7 @@ void TestBreakIterator::testLegacyHebrewQuoteInsideWord()
     aLocale.Language = "he";
     aLocale.Country = "IL";
 
+    // i#51661: Add quotation mark as middle letter for Hebrew
     {
         auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr;
 
@@ -1572,6 +1669,21 @@ void TestBreakIterator::testLegacyHebrewQuoteInsideWord()
         CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
         CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
     }
+
+    // i#51661: Add quotation mark as middle letter for Hebrew
+    {
+        auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr;
+
+        i18n::Boundary aBounds = m_xBreak->getWordBoundary(
+            aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES, 
false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
+    }
 }
 
 void TestBreakIterator::testLegacySurrogatePairs()
diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt
index b1666f44daab..f804b0eec214 100644
--- a/i18npool/source/breakiterator/data/dict_word.txt
+++ b/i18npool/source/breakiterator/data/dict_word.txt
@@ -1,148 +1,199 @@
 #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (C) 2002-2016, International Business Machines Corporation
+# and others. All Rights Reserved.
 #
-#   file:  dict_word.txt   
+# file:  word.txt
 #
-#   ICU Word Break Rules
+# ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
+# Note:  Updates to word.txt will usually need to be merged into
+#        word_POSIX.txt also.
 
-
-
-####################################################################################
+##############################################################################
 #
 #  Character class definitions from TR 29
 #
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW 
PUNCTUATION GERESH:]
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = 
SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] 
-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL 
LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:] [:name = PRIME:] 
-              [:name = HYPHEN-MINUS:] ];
-
-$SufixLetter = [:name= FULL STOP:];
-              
-
-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 
[:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE 
QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
-             [:name = PRIME:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
+##############################################################################
+
+### BEGIN CUSTOMIZATION
+### This file contains LibreOffice-specific rule customizations.
+###
+### To aid future maintainability:
+### - The change location should be bracketed by comments of this form.
+### - The original rule should be commented out, and the modified rule placed 
alongside.
+### - By doing this, maintainers can more easily compare to an upstream 
baseline.
+###
+### END CUSTOMIZATION
+
+!!chain;
+!!quoted_literals_only;
+
 
 #
 #  Character Class Definitions.
-#    The names are those from TR29.
 #
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
 
+$Han                = [:Han:];
 
+$CR                 = [\p{Word_Break = CR}];
+$LF                 = [\p{Word_Break = LF}];
+$Newline            = [\p{Word_Break = Newline}];
+$Extend             = [\p{Word_Break = Extend}-$Han];
+$ZWJ                = [\p{Word_Break = ZWJ}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format             = [\p{Word_Break = Format}];
+$Katakana           = [\p{Word_Break = Katakana}];
+$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+$ALetter            = [\p{Word_Break = ALetter}];
+$Single_Quote       = [\p{Word_Break = Single_Quote}];
+$Double_Quote       = [\p{Word_Break = Double_Quote}];
+$MidNumLet          = [\p{Word_Break = MidNumLet}];
+$MidNum             = [\p{Word_Break = MidNum}];
+$Numeric            = [\p{Word_Break = Numeric}];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+$WSegSpace          = [\p{Word_Break = WSegSpace}];
+$Extended_Pict      = [\p{Extended_Pictographic}];
 
+### BEGIN CUSTOMIZATION
+### Unknown issue number: Dictionary words can contain hyphens
+### tdf#49885: Sync custom BreakIterator rules with ICU originals
+### - ICU is now more permissive about punctuation inside words.
+### - For compatibility, exclude certain characters that were previously 
excluded.
 
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
-#
-####################################################################################
+$IncludedML         = [:name = HYPHEN-MINUS:];
+$ExcludedML         = [[:name = COLON:]
+                       [:name = GREEK ANO TELEIA:]
+                       [:name = PRESENTATION FORM FOR VERTICAL COLON:]
+                       [:name = SMALL COLON:]
+                       [:name = FULLWIDTH COLON:]];
 
-$Format    = [[:Cf:] - $TheZWSP];
+# $MidLetter          = [\p{Word_Break = MidLetter}];
+$MidLetter          = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML];
 
+### END CUSTOMIZATION
 
+$Hiragana           = [:Hiragana:];
+$Ideographic        = [\p{Ideographic}];
 
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
 
+#   Dictionary character set, for triggering language-based break engines. 
Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in 
Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include 
all
+#   characters requiring dictionary break.
 
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$SufixLetterEx= $SufixLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$KanaKanji $HangulSyllable];
+$dictionary     = [$ComplexContext $dictionaryCJK];
 
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
 
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
 
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* 
$ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* $SufixLetterEx? {200};
 
-[[:P:][:S:]]*;
+## -------------------------------------------------
 
+# Rule 3 - CR x LF
 #
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
+$CR $LF;
 
+# Rule 3c   Do not break within emoji zwj sequences.
+#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no 
intervening Extend chars allowed.
 #
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
+$ZWJ $Extended_Pict;
 
+# Rule 3d - Keep horizontal whitespace together.
 #
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
+$WSegSpace $WSegSpace;
+
+# Rule 4 - ignore Format and Extend characters, except when they appear at the 
beginning
+#          of a region of Text.
+
+$ExFm  = [$Extend $Format $ZWJ];
+
+^$ExFm+;            # This rule fires only when there are format or extend 
characters at the
+                    # start of text, or immediately following another 
boundary. It groups them, in
+                    # the event there are more than one.
+
+[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule attaches trailing 
format/extends to words,
+                                    # with no special rule status value.
+
+$Numeric $ExFm* {100};              # This group of rules also attaches trailing 
format/extends, but
+$ALetterPlus $ExFm* {200};          # with rule status set based on the word's 
final base character.
+$HangulSyllable {200};
+$Hebrew_Letter $ExFm* {200};
+$Katakana $ExFm* {400};             # note:  these status values override 
those from rule 5
+$Hiragana $ExFm* {400};             #        by virtue of being numerically 
larger.
+$Ideographic $ExFm* {400};          #
 
 #
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
+# rule 5
+#    Do not break between most letters.
 #
+($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 6 and 7
+($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | 
$Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
+
+# rule 7a
+$Hebrew_Letter $ExFm* $Single_Quote {200};
+
+# rule 7b and 7c
+$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
+
+# rule 8
+
+$Numeric $ExFm* $Numeric;
+
+# rule 9
+
+($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
+
+# rule 10
+
+$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 11 and 12
+
+$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
+
+# rule 13
+# to be consistent with $KanaKanji $KanaKanji, changed
+# from 300 to 400.
+# See also TestRuleStatus in intltest/rbbiapts.cpp
+$Katakana $ExFm*  $Katakana {400};
+
+# rule 13a/b
+
+$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
+$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
+$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
+$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
+$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
+
+$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
+$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
 
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
+# rules 15 - 17
+#    Pairs of Regional Indicators stay together.
+#    With incoming rule chaining disabled by ^, this rule will match exactly 
two of them.
+#    No other rule begins with a Regional_Indicator, so chaining cannot extend 
the match.
 #
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] 
$MidLetter $MidNum $SufixLetter $Extend $Format];
+^$Regional_Indicator $ExFm* $Regional_Indicator;
 
-#!.*;
-! ($NonStarters* | 
  ) .;
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji 
found
 
+# Rule 999
+#     Match a single code point if no other rule applies.
+.;
diff --git a/i18npool/source/breakiterator/data/dict_word_he.txt 
b/i18npool/source/breakiterator/data/dict_word_he.txt
deleted file mode 100644
index 40197d92a431..000000000000
--- a/i18npool/source/breakiterator/data/dict_word_he.txt
+++ /dev/null
@@ -1,139 +0,0 @@
-#
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
-#
-#   file:  dict_word.txt   
-#
-#   ICU Word Break Rules
-#      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
-#
-
-
-
-####################################################################################
-#
-#  Character class definitions from TR 29
-#
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-
-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW 
PUNCTUATION GERESH:]
-                           - $Katakana
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = GRAVE 
ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK 
TONOS:] [:name= FULL STOP:]
-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL 
LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]];  
-              
-$SufixLetter = [:name= FULL STOP:];
-
-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 
[:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE 
QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
-             [:name = PRIME:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
-
-#
-#  Character Class Definitions.
-#    The names are those from TR29.
-#
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
-
-
-
-
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
-#
-####################################################################################
-
-$Format    = [[:Cf:] - $TheZWSP];
-
-
-
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
-
-
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$SufixLetterEx= $SufixLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$FormatEx     = $Format    $Extend*;
-
-
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
-
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* 
$ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* $SufixLetterEx? {200};
-
-[[:P:][:S:]]*;
-
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
-
-#
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-# [:IDEOGRAPHIC:] $Extend* {400};
-
-#
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
-
-#
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
-#
-
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
-#
-$NonStarters = [$Numeric $ALetter $Katakana [:P:] [:S:] $MidLetter $MidNum 
$SufixLetter $Extend $Format];
-
-#!.*;
-! ($NonStarters* | 
  ) .;
-
diff --git a/i18npool/source/breakiterator/data/dict_word_hu.txt 
b/i18npool/source/breakiterator/data/dict_word_hu.txt
index b0a0276b36a8..88648e6e5716 100644
--- a/i18npool/source/breakiterator/data/dict_word_hu.txt
+++ b/i18npool/source/breakiterator/data/dict_word_hu.txt
@@ -1,176 +1,222 @@
 #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (C) 2002-2016, International Business Machines Corporation
+# and others. All Rights Reserved.
 #
-#   file:  dict_word.txt   
+# file:  word.txt
 #
-#   ICU Word Break Rules
+# ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
+# Note:  Updates to word.txt will usually need to be merged into
+#        word_POSIX.txt also.
 
-
-
-####################################################################################
+##############################################################################
 #
 #  Character class definitions from TR 29
 #
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-
-# Fix spelling of a)-ban, b)-ben, when the letter is a reference
-# resulting bad word breaking "ban" and "ben"
-# (reference fields are not expanded in spell checking, yet, only
-# for grammar checking).
-
-$PrefixLetter = [[:name = RIGHT PARENTHESIS:]];
-
-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW 
PUNCTUATION GERESH:]
-                [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER 
TEN THOUSAND SIGN:]
-                [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO 
SIGN:]
-                [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:]
-                [:name = DIGIT ZERO:]
-                [:name = DIGIT ONE:]
-                [:name = DIGIT TWO:]
-                [:name = DIGIT THREE:]
-                [:name = DIGIT FOUR:]
-                [:name = DIGIT FIVE:]
-                [:name = DIGIT SIX:]
-                [:name = DIGIT SEVEN:]
-                [:name = DIGIT EIGHT:]
-                [:name = DIGIT NINE:]
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = 
SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:]
-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL 
LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]
-              [:name = EURO SIGN:] [:name = PERCENT SIGN:] [:name = PER MILLE 
SIGN:] [:name = PER TEN THOUSAND SIGN:]
-              [:name = EN DASH:] [:name = EM DASH:]
-              [:name = RIGHT DOUBLE QUOTATION MARK:]
-              [:name = LEFT PARENTHESIS:]
-              [:name = RIGHT PARENTHESIS:]
-              [:name = RIGHT SQUARE BRACKET:]
-              [:name = EXCLAMATION MARK:]
-              [:name = QUESTION MARK:]
-              [:name = FULL STOP:] [:name = PERCENT SIGN:] [:name = SECTION 
SIGN:] [:name = DEGREE SIGN:]];  
-              
-$SufixLetter = [:name= FULL STOP:];
-
-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 
[:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE 
QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
-             [:name = PRIME:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
+##############################################################################
+
+### BEGIN CUSTOMIZATION
+### This file contains LibreOffice-specific rule customizations.
+###
+### To aid future maintainability:
+### - The change location should be bracketed by comments of this form.
+### - The original rule should be commented out, and the modified rule placed 
alongside.
+### - By doing this, maintainers can more easily compare to an upstream 
baseline.
+###
+### END CUSTOMIZATION
+
+!!chain;
+!!quoted_literals_only;
+
 
 #
 #  Character Class Definitions.
-#    The names are those from TR29.
 #
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
-
-
 
+$Han                = [:Han:];
+
+$CR                 = [\p{Word_Break = CR}];
+$LF                 = [\p{Word_Break = LF}];
+$Newline            = [\p{Word_Break = Newline}];
+$Extend             = [\p{Word_Break = Extend}-$Han];
+$ZWJ                = [\p{Word_Break = ZWJ}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format             = [\p{Word_Break = Format}];
+$Katakana           = [\p{Word_Break = Katakana}];
+$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+$Single_Quote       = [\p{Word_Break = Single_Quote}];
+$Double_Quote       = [\p{Word_Break = Double_Quote}];
+$MidNumLet          = [\p{Word_Break = MidNumLet}];
+$MidNum             = [\p{Word_Break = MidNum}];
+$Numeric            = [\p{Word_Break = Numeric}];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+$WSegSpace          = [\p{Word_Break = WSegSpace}];
+$Extended_Pict      = [\p{Extended_Pictographic}];
+
+### BEGIN CUSTOMIZATION
+### Unknown issue number: Dictionary words can contain hyphens
+### tdf#49885: Sync custom BreakIterator rules with ICU originals
+### - ICU is now more permissive about punctuation inside words.
+### - For compatibility, exclude certain characters that were previously 
excluded.
+### tdf#116072: Extend MidLetter in Hungarian word breaking
+### i#56347: BreakIterator patch for Hungarian
+### i#56348: Special chars in first pos not handled by spell checking for 
Hungarian
+
+$Symbols_hu         = [[:name = PERCENT SIGN:]
+                       [:name = PER MILLE SIGN:]
+                       [:name = PER TEN THOUSAND SIGN:]
+                       [:name = SECTION SIGN:]
+                       [:name = DEGREE SIGN:]
+                       [:name = EURO SIGN:]
+                       [:name = HYPHEN-MINUS:]
+                       [:name = EN DASH:]
+                       [:name = EM DASH:]];
+
+#$ALetter            = [\p{Word_Break = ALetter}];
+$ALetter            = [\p{Word_Break = ALetter} $Symbols_hu];
+
+$IncludedML         = [:name = HYPHEN-MINUS:];
+$ExcludedML         = [[:name = COLON:]
+                       [:name = GREEK ANO TELEIA:]
+                       [:name = PRESENTATION FORM FOR VERTICAL COLON:]
+                       [:name = SMALL COLON:]
+                       [:name = FULLWIDTH COLON:]];
+
+$IncludedML_hu      = [[:name = RIGHT DOUBLE QUOTATION MARK:]
+                       [:name = LEFT PARENTHESIS:]
+                       [:name = RIGHT PARENTHESIS:]
+                       [:name = RIGHT SQUARE BRACKET:]
+                       [:name = EXCLAMATION MARK:]
+                       [:name = QUESTION MARK:]
+                       $Symbols_hu];
+
+# $MidLetter          = [\p{Word_Break = MidLetter}];
+$MidLetter          = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML 
$IncludedML_hu];
+
+### END CUSTOMIZATION
+
+$Hiragana           = [:Hiragana:];
+$Ideographic        = [\p{Ideographic}];
+
+
+#   Dictionary character set, for triggering language-based break engines. 
Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in 
Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include 
all
+#   characters requiring dictionary break.
+
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$KanaKanji $HangulSyllable];
+$dictionary     = [$ComplexContext $dictionaryCJK];
+
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
+
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+
+
+## -------------------------------------------------
+
+# Rule 3 - CR x LF
+#
+$CR $LF;
 
-####################################################################################
+# Rule 3c   Do not break within emoji zwj sequences.
+#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no 
intervening Extend chars allowed.
 #
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
+$ZWJ $Extended_Pict;
+
+# Rule 3d - Keep horizontal whitespace together.
 #
-####################################################################################
+$WSegSpace $WSegSpace;
 
-$Format    = [[:Cf:] - $TheZWSP];
+# Rule 4 - ignore Format and Extend characters, except when they appear at the 
beginning
+#          of a region of Text.
 
+$ExFm  = [$Extend $Format $ZWJ];
 
+^$ExFm+;            # This rule fires only when there are format or extend 
characters at the
+                    # start of text, or immediately following another 
boundary. It groups them, in
+                    # the event there are more than one.
 
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
+[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule attaches trailing 
format/extends to words,
+                                    # with no special rule status value.
 
+$Numeric $ExFm* {100};              # This group of rules also attaches trailing 
format/extends, but
+$ALetterPlus $ExFm* {200};          # with rule status set based on the word's 
final base character.
+$HangulSyllable {200};
+$Hebrew_Letter $ExFm* {200};
+$Katakana $ExFm* {400};             # note:  these status values override 
those from rule 5
+$Hiragana $ExFm* {400};             #        by virtue of being numerically 
larger.
+$Ideographic $ExFm* {400};          #
 
 #
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
+# rule 5
+#    Do not break between most letters.
 #
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$SufixLetterEx= $SufixLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
+($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
 
+# rule 6 and 7
+($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | 
$Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
 
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
+# rule 7a
+$Hebrew_Letter $ExFm* $Single_Quote {200};
 
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $PrefixLetter? $ALetterEx ($FormatEx* $MidLetterEx? 
$FormatEx* $ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* $SufixLetterEx? {200};
+# rule 7b and 7c
+$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
 
-[[:P:][:S:]]*;
+# rule 8
 
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
+$Numeric $ExFm* $Numeric;
 
-#
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
+# rule 9
 
-#
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
+($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
 
-#
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
-#
+# rule 10
+
+$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 11 and 12
+
+$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
+
+# rule 13
+# to be consistent with $KanaKanji $KanaKanji, changed
+# from 300 to 400.
+# See also TestRuleStatus in intltest/rbbiapts.cpp
+$Katakana $ExFm*  $Katakana {400};
+
+# rule 13a/b
+
+$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
+$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
+$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
+$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
+$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
+
+$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
+$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
 
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
+# rules 15 - 17
+#    Pairs of Regional Indicators stay together.
+#    With incoming rule chaining disabled by ^, this rule will match exactly 
two of them.
+#    No other rule begins with a Regional_Indicator, so chaining cannot extend 
the match.
 #
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] 
$MidLetter $MidNum $SufixLetter $Extend $Format];
+^$Regional_Indicator $ExFm* $Regional_Indicator;
 
-#!.*;
-! ($NonStarters* | 
  ) .;
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji 
found
 
+# Rule 999
+#     Match a single code point if no other rule applies.
+.;
diff --git a/i18npool/source/breakiterator/data/dict_word_nodash.txt 
b/i18npool/source/breakiterator/data/dict_word_nodash.txt
deleted file mode 100644
index 279cc50e5b66..000000000000
--- a/i18npool/source/breakiterator/data/dict_word_nodash.txt
+++ /dev/null
@@ -1,147 +0,0 @@
-#
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
-#
-#   file:  dict_word.txt   
-#
-#   ICU Word Break Rules
-#      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
-#
-
-
-
-####################################################################################
-#
-#  Character class definitions from TR 29
-#
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW 
PUNCTUATION GERESH:]
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = 
SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] 
-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL 
LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:] [:name = PRIME:] ];  
-
-$SufixLetter = [:name= FULL STOP:];
-              
-
-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 
[:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE 
QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
-             [:name = PRIME:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
-
-#
-#  Character Class Definitions.
-#    The names are those from TR29.
-#
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
-
-
-
-
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
-#
-####################################################################################
-
-$Format    = [[:Cf:] - $TheZWSP];
-
-
-
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
-
-
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$SufixLetterEx= $SufixLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
-
-
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
-
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* 
$ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* $SufixLetterEx? {200};
-
-[[:P:][:S:]]*;
-
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
-
-#
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
-
-#
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
-
-#
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
-#
-
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
-#
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] 
$MidLetter $MidNum $SufixLetter $Extend $Format];
-
-#!.*;
-! ($NonStarters* | 
  ) .;
-
diff --git a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt 
b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
index fb29b478af21..b39503d1b405 100644
--- a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
+++ b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
@@ -1,157 +1,221 @@
 #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (C) 2002-2016, International Business Machines Corporation
+# and others. All Rights Reserved.
 #
-#   file:  dict_word.txt   
+# file:  word.txt
 #
-#   ICU Word Break Rules
+# ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
+# Note:  Updates to word.txt will usually need to be merged into
+#        word_POSIX.txt also.
 
-
-
-####################################################################################
+##############################################################################
 #
 #  Character class definitions from TR 29
 #
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
+##############################################################################
 
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
+### BEGIN CUSTOMIZATION
+### This file contains LibreOffice-specific rule customizations.
+###
+### To aid future maintainability:
+### - The change location should be bracketed by comments of this form.
+### - The original rule should be commented out, and the modified rule placed 
alongside.
+### - By doing this, maintainers can more easily compare to an upstream 
baseline.
+###
+### END CUSTOMIZATION
 
-# list of dashes or hyphens that should be accepted as part of the word if a 
single one of these
-# pre- or postfixes a word. E.g. in German: "Arbeits-" or "-nehmer" where that 
hyphen needs to
-# be part of the word in order to have it properly spell checked etc.
-$PrePostDashHyphen = [ [:name = HYPHEN-MINUS:] ];
+!!chain;
+!!quoted_literals_only;
 
 
-$ALetter   = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW 
PUNCTUATION GERESH:]
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = 
SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] 
-              [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL 
LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:] [:name = PRIME:] 
-              [:name = HYPHEN-MINUS:] ];
+#
+#  Character Class Definitions.
+#
 
-$SufixLetter = [:name= FULL STOP:];
-              
+$Han                = [:Han:];
 
-$MidNum    = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 
[:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
-             [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE 
QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
-             [:name = PRIME:]];
-$Numeric   = [:LineBreak = Numeric:];
+$CR                 = [\p{Word_Break = CR}];
+$LF                 = [\p{Word_Break = LF}];
+$Newline            = [\p{Word_Break = Newline}];
+$Extend             = [\p{Word_Break = Extend}-$Han];
+$ZWJ                = [\p{Word_Break = ZWJ}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format             = [\p{Word_Break = Format}];
+$Katakana           = [\p{Word_Break = Katakana}];
+$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+$ALetter            = [\p{Word_Break = ALetter}];
+$Single_Quote       = [\p{Word_Break = Single_Quote}];
+$Double_Quote       = [\p{Word_Break = Double_Quote}];
+$MidNumLet          = [\p{Word_Break = MidNumLet}];
+$MidNum             = [\p{Word_Break = MidNum}];
+$Numeric            = [\p{Word_Break = Numeric}];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+$WSegSpace          = [\p{Word_Break = WSegSpace}];
+$Extended_Pict      = [\p{Extended_Pictographic}];
 
+### BEGIN CUSTOMIZATION
+### Unknown issue number: Dictionary words can contain hyphens
+### tdf#49885: Sync custom BreakIterator rules with ICU originals
+### - ICU is now more permissive about punctuation inside words.
+### - For compatibility, exclude certain characters that were previously 
excluded.
 
-$TheZWSP = \u200b;
+$IncludedML         = [:name = HYPHEN-MINUS:];
+$ExcludedML         = [[:name = COLON:]
+                       [:name = GREEK ANO TELEIA:]
+                       [:name = PRESENTATION FORM FOR VERTICAL COLON:]
+                       [:name = SMALL COLON:]
+                       [:name = FULLWIDTH COLON:]];
 
-#
-#  Character Class Definitions.
-#    The names are those from TR29.
-#
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+# $MidLetter          = [\p{Word_Break = MidLetter}];
+$MidLetter          = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML];
 
+### END CUSTOMIZATION
 
+### BEGIN CUSTOMIZATION
+### Unknown issue number: Allow leading and trailing hyphens in certain 
languages
+### This part of the customization does not replace any rules.
 
+$PrePostHyphen      = [:name = HYPHEN-MINUS:];
 
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
-#
-####################################################################################
+### END CUSTOMIZATION
 
-$Format    = [[:Cf:] - $TheZWSP];
+$Hiragana           = [:Hiragana:];
+$Ideographic        = [\p{Ideographic}];
 
 
+#   Dictionary character set, for triggering language-based break engines. 
Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in 
Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include 
all
+#   characters requiring dictionary break.
 
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$KanaKanji $HangulSyllable];
+$dictionary     = [$ComplexContext $dictionaryCJK];
 
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
 
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$SufixLetterEx= $SufixLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
 
 
+## -------------------------------------------------
+
+# Rule 3 - CR x LF
 #
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
+$CR $LF;
 
+# Rule 3c   Do not break within emoji zwj sequences.
+#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no 
intervening Extend chars allowed.
 #
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
+$ZWJ $Extended_Pict;
+
+# Rule 3d - Keep horizontal whitespace together.
 #
-# At most one leading or trailing dash/hyphen should be accepted as well.
-# E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to
-# be part of the word in order to have it properly spell checked etc.
-$LetterSequence = $PrePostDashHyphen? $ALetterEx ($FormatEx* $MidLetterEx? 
$FormatEx* $ALetterEx)* $PrePostDashHyphen?;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* $SufixLetterEx? {200};
+$WSegSpace $WSegSpace;
 
-[[:P:][:S:]]*;
+# Rule 4 - ignore Format and Extend characters, except when they appear at the 
beginning
+#          of a region of Text.
 
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
+$ExFm  = [$Extend $Format $ZWJ];
 
-#
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
+^$ExFm+;            # This rule fires only when there are format or extend 
characters at the
+                    # start of text, or immediately following another 
boundary. It groups them, in
+                    # the event there are more than one.
 
-#
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
+[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule attaches trailing 
format/extends to words,
+                                    # with no special rule status value.
+
+$Numeric $ExFm* {100};              # This group of rules also attach trailing 
format/extends, but
+$ALetterPlus $ExFm* {200};          # with rule status set based on the word's 
final base character.
+$HangulSyllable {200};
+$Hebrew_Letter $ExFm* {200};
+$Katakana $ExFm* {400};             # note:  these status values override 
those from rule 5
+$Hiragana $ExFm* {400};             #        by virtue of being numerically 
larger.
+$Ideographic $ExFm* {400};          #
 
 #
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
+# rule 5
+#    Do not break between most letters.
 #
 
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
+### BEGIN CUSTOMIZATION
+### Unknown issue number: Allow leading and trailing hyphens in certain 
languages
+
+# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
+($PrePostHyphen) ? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | 
$Hebrew_Letter) ($PrePostHyphen)?;
+
+### END CUSTOMIZATION
+
+# rule 6 and 7
+
+### BEGIN CUSTOMIZATION
+### Unknown issue number: Allow leading and trailing hyphens in certain 
languages
+
+# ($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | 
$Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
+($PrePostHyphen)? ($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | 
$MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) 
($PrePostHyphen)? {200};
+
+### END CUSTOMIZATION
+
+# rule 7a
+$Hebrew_Letter $ExFm* $Single_Quote {200};
+
+# rule 7b and 7c
+$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
+
+# rule 8
+
+$Numeric $ExFm* $Numeric;
+
+# rule 9
+
+($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
+
+# rule 10
+
+$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 11 and 12
+
+$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
+
+# rule 13
+# to be consistent with $KanaKanji $KanaKanji, changed
+# from 300 to 400.
+# See also TestRuleStatus in intltest/rbbiapts.cpp
+$Katakana $ExFm*  $Katakana {400};
+
+# rule 13a/b
+
+$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
+$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
+$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
+$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
+$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
+
+$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
+$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
+
+# rules 15 - 17
+#    Pairs of Regional Indicators stay together.
+#    With incoming rule chaining disabled by ^, this rule will match exactly 
two of them.
+#    No other rule begins with a Regional_Indicator, so chaining cannot extend 
the match.
 #
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] 
$MidLetter $MidNum $SufixLetter $Extend $Format];
+^$Regional_Indicator $ExFm* $Regional_Indicator;
 
-#!.*;
-! ($NonStarters* | 
  ) .;
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji 
found
 
+# Rule 999
+#     Match a single code point if no other rule applies.
+.;
diff --git a/i18npool/source/breakiterator/data/edit_word.txt 
b/i18npool/source/breakiterator/data/edit_word.txt
index 92b344c19d41..14fc221aa96e 100644
--- a/i18npool/source/breakiterator/data/edit_word.txt
+++ b/i18npool/source/breakiterator/data/edit_word.txt
@@ -1,142 +1,199 @@
 #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (C) 2002-2016, International Business Machines Corporation
+# and others. All Rights Reserved.
 #
-#   file:  edit_word.txt   
+# file:  word.txt
 #
-#   ICU Word Break Rules
+# ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
+# Note:  Updates to word.txt will usually need to be merged into
+#        word_POSIX.txt also.
 
-
-
-####################################################################################
+##############################################################################
 #
 #  Character class definitions from TR 29
 #
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-$ALetter   = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW 
PUNCTUATION GERESH:] 
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]  [:name = HEBREW 
PUNCTUATION GERSHAYIM:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:]];  
-              
-$MidNum    = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
+##############################################################################
+
+### BEGIN CUSTOMIZATION
+### This file contains LibreOffice-specific rule customizations.
+###
+### To aid future maintainability:
+### - The change location should be bracketed by comments of this form.
+### - The original rule should be commented out, and the modified rule placed 
alongside.
+### - By doing this, maintainers can more easily compare to an upstream 
baseline.
+###
+### END CUSTOMIZATION
+
+!!chain;
+!!quoted_literals_only;
+
 
 #
 #  Character Class Definitions.
-#    The names are those from TR29.
 #
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
 
+$Han                = [:Han:];
 
+$CR                 = [\p{Word_Break = CR}];
+$LF                 = [\p{Word_Break = LF}];
+$Newline            = [\p{Word_Break = Newline}];
+$Extend             = [\p{Word_Break = Extend}-$Han];
+$ZWJ                = [\p{Word_Break = ZWJ}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format             = [\p{Word_Break = Format}];
+$Katakana           = [\p{Word_Break = Katakana}];
+$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+$ALetter            = [\p{Word_Break = ALetter}];
+$Single_Quote       = [\p{Word_Break = Single_Quote}];
+$Double_Quote       = [\p{Word_Break = Double_Quote}];
+$MidLetter          = [\p{Word_Break = MidLetter}];
+$MidNum             = [\p{Word_Break = MidNum}];
+$Numeric            = [\p{Word_Break = Numeric}];
+$WSegSpace          = [\p{Word_Break = WSegSpace}];
+$Extended_Pict      = [\p{Extended_Pictographic}];
 
+### BEGIN CUSTOMIZATION
+### i#13494: For the purposes of editing, standalone punctuation should be 
treated as a word.
+### This change subtracts undesired characters from the above families
 
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
-#
-####################################################################################
+# $MidNumLet          = [\p{Word_Break = MidNumLet}];
+$MidNumLet          = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]];
 
-$Format    = [[:Cf:] - $TheZWSP];
+# $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]];
 
+### END CUSTOMIZATION
 
+$Hiragana           = [:Hiragana:];
+$Ideographic        = [\p{Ideographic}];
 
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
 
+#   Dictionary character set, for triggering language-based break engines. 
Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in 
Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include 
all
+#   characters requiring dictionary break.
 
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$KanaKanji $HangulSyllable];
+$dictionary     = [$ComplexContext $dictionaryCJK];
 
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
 
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
 
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* 
$ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* {200};
 
-# Punctuations by themselves
-[[:P:][:S:]-[:name = FULL STOP:]]*;
-[[:name = FULL STOP:]]*;
+## -------------------------------------------------
 
+# Rule 3 - CR x LF
 #
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
+$CR $LF;
 
+# Rule 3c   Do not break within emoji zwj sequences.
+#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no 
intervening Extend chars allowed.
 #
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
+$ZWJ $Extended_Pict;
 
+# Rule 3d - Keep horizontal whitespace together.
 #
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
+$WSegSpace $WSegSpace;
+
+# Rule 4 - ignore Format and Extend characters, except when they appear at the 
beginning
+#          of a region of Text.
+
+$ExFm  = [$Extend $Format $ZWJ];
+
+^$ExFm+;            # This rule fires only when there are format or extend 
characters at the
+                    # start of text, or immediately following another 
boundary. It groups them, in
+                    # the event there are more than one.
+
+[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule attaches trailing 
format/extends to words,
+                                    # with no special rule status value.
+
+$Numeric $ExFm* {100};              # This group of rules also attach trailing 
format/extends, but
+$ALetterPlus $ExFm* {200};          # with rule status set based on the word's 
final base character.
+$HangulSyllable {200};
+$Hebrew_Letter $ExFm* {200};
+$Katakana $ExFm* {400};             # note:  these status values override 
those from rule 5
+$Hiragana $ExFm* {400};             #        by virtue of being numerically 
larger.
+$Ideographic $ExFm* {400};          #
 
 #
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
+# rule 5
+#    Do not break between most letters.
 #
+($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 6 and 7
+($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | 
$Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
+
+# rule 7a
+$Hebrew_Letter $ExFm* $Single_Quote {200};
+
+# rule 7b and 7c
+$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
+
+# rule 8
+
+$Numeric $ExFm* $Numeric;
+
+# rule 9
+
+($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
 
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
+# rule 10
+
+$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 11 and 12
+
+$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
+
+# rule 13
+# to be consistent with $KanaKanji $KanaKanji, changed
+# from 300 to 400.
+# See also TestRuleStatus in intltest/rbbiapts.cpp
+$Katakana $ExFm*  $Katakana {400};
+
+# rule 13a/b
+
+$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
+$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
+$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
+$Katakana      $ExFm* $ExtendNumLet {400};    #  (13a)
+$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
+
+$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
+$ExtendNumLet  $ExFm* $Katakana     {400};    #  (13b)
+
+# rules 15 - 17
+#    Pairs of Regional Indicators stay together.
+#    With incoming rule chaining disabled by ^, this rule will match exactly 
two of them.
+#    No other rule begins with a Regional_Indicator, so chaining cannot extend 
the match.
 #
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] 
$MidLetter $MidNum $Extend $Format];
+^$Regional_Indicator $ExFm* $Regional_Indicator;
 
-#!.*;
-! ($NonStarters* | 
  ) .;
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji 
found
+
+### BEGIN CUSTOMIZATION
+### i#13494: For the purposes of editing, standalone punctuation should be 
treated as a word.
+### This customization does not replace any rules.
+[[:P:][:S:]-[:name = FULL STOP:]]*;    # a run of punctuation/symbols other than FULL STOP
+[[:name = FULL STOP:]]*;               # a run of FULL STOPs, kept as its own rule
+### END CUSTOMIZATION
 
+# Rule 999
+#     Match a single code point if no other rule applies.
+.;
diff --git a/i18npool/source/breakiterator/data/edit_word_he.txt 
b/i18npool/source/breakiterator/data/edit_word_he.txt
deleted file mode 100644
index 0b5908814e08..000000000000
--- a/i18npool/source/breakiterator/data/edit_word_he.txt
+++ /dev/null
@@ -1,142 +0,0 @@
-#
-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
-#       All Rights Reserved.
-#
-#   file:  edit_word.txt   
-#
-#   ICU Word Break Rules
-#      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
-#
-
-
-
-####################################################################################
-#
-#  Character class definitions from TR 29
-#
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-$ALetter   = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW 
PUNCTUATION GERESH:] 
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = MIDDLE 
DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:]];  
-              
-$MidNum    = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]];
-$Numeric   = [:LineBreak = Numeric:];
-
-
-$TheZWSP = \u200b;
-
-#
-#  Character Class Definitions.
-#    The names are those from TR29.
-#
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
-
-
-
-
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin 
Here. 
-#
-####################################################################################
-
-$Format    = [[:Cf:] - $TheZWSP];
-
-
-
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in 
Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent 
syllables -
-#          they won't be word boundaries.
-#
-
-
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the 
base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$IdeographicEx= $Ideographic  $Extend*;
-$HangulEx = $Hangul  $Extend*;
-$FormatEx     = $Format    $Extend*;
-
-
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
-
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* 
$ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | 
$LetterSequence))* {200};
-
-# Punctuations by themselves
-[[:P:][:S:]-[:name = FULL STOP:]]*;
-[[:name = FULL STOP:]]*;
-
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
-
-#
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, 
only so that they
-#                           can be tagged with a return value.   TODO:  is 
this what we want?
-#
-$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
-$HangulEx ($FormatEx* $HangulEx)* {400};
-
-#
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
-
-#
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  
too far,
-#                   but must back up at least enough, and must stop on a 
boundary.)
-#
-
-# NonStarters are the set of all characters that can appear at the 2nd - nth 
position of
-#    a word.   (They may also be the first.)   The reverse rule skips over 
these, until it
-#    reaches something that can only be the start (and probably only) char in 
a "word".
-#    A space or punctuation meets the test.
-#
-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] 
$MidLetter $MidNum $Extend $Format];
-
-#!.*;
-! ($NonStarters* | 
  ) .;
-
diff --git a/i18npool/source/breakiterator/data/edit_word_hu.txt b/i18npool/source/breakiterator/data/edit_word_hu.txt
index 4a08acab0029..389ad2bacc13 100644
--- a/i18npool/source/breakiterator/data/edit_word_hu.txt
+++ b/i18npool/source/breakiterator/data/edit_word_hu.txt
@@ -1,159 +1,215 @@
 #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and 
others.
-#       All Rights Reserved.
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (C) 2002-2016, International Business Machines Corporation
+# and others. All Rights Reserved.
 #
-#   file:  edit_word.txt   
+# file:  word.txt
 #
-#   ICU Word Break Rules
+# ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on Version 4.0.0, dated 2003-04-17
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
+# Note:  Updates to word.txt will usually need to be merged into
+#        word_POSIX.txt also.
 
-
-
-####################################################################################
+##############################################################################
 #
 #  Character class definitions from TR 29
 #
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND 
MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND 
MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED 
SOUND MARK:]];
-
-$Ideographic = [:Ideographic:];
-$Hangul = [:Script = HANGUL:];
-
-$ALetter   = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW 
PUNCTUATION GERESH:] 
-                [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER 
TEN THOUSAND SIGN:]
-                [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO 
SIGN:]
-                [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:]
-                [:name = DIGIT ZERO:]
-                [:name = DIGIT ONE:]
-                [:name = DIGIT TWO:]
-                [:name = DIGIT THREE:]
-                [:name = DIGIT FOUR:]
-                [:name = DIGIT FIVE:]
-                [:name = DIGIT SIX:]
-                [:name = DIGIT SEVEN:]
-                [:name = DIGIT EIGHT:]
-                [:name = DIGIT NINE:]
-                           - $Ideographic
-                           - $Katakana
-                           - $Hangul
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]  [:name = HEBREW 
PUNCTUATION GERSHAYIM:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION 
POINT:]  
-              [:name = HYPHEN-MINUS:] [:name = EURO SIGN:] [:name = PERCENT 
SIGN:] 
-              [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
-              [:name = EN DASH:] [:name = EM DASH:]
-              [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE 
SIGN:]];
-              
-$MidNum    = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]];
-e 
... etc. - the rest is truncated

core.git: i18npool/CustomTarget_breakiterator.mk i18npool/qa i18npool/source

Reply via email to