i18npool/CustomTarget_breakiterator.mk | 6 i18npool/qa/cppunit/test_breakiterator.cxx | 354 +++-- i18npool/source/breakiterator/data/dict_word.txt | 267 ++-- i18npool/source/breakiterator/data/dict_word_he.txt | 139 -- i18npool/source/breakiterator/data/dict_word_hu.txt | 324 ++--- i18npool/source/breakiterator/data/dict_word_nodash.txt | 147 -- i18npool/source/breakiterator/data/dict_word_prepostdash.txt | 288 ++-- i18npool/source/breakiterator/data/edit_word.txt | 261 ++-- i18npool/source/breakiterator/data/edit_word_he.txt | 142 -- i18npool/source/breakiterator/data/edit_word_hu.txt | 294 ++-- i18npool/source/breakiterator/data/line.txt | 680 +++-------- i18npool/source/breakiterator/data/sent.txt | 128 -- 12 files changed, 1306 insertions(+), 1724 deletions(-)
New commits: commit 44699b3de37f07090ac6fee1cd97aa76036e9700 Author: Jonathan Clark <[email protected]> AuthorDate: Wed Apr 17 09:09:50 2024 -0600 Commit: Caolán McNamara <[email protected]> CommitDate: Thu May 9 21:23:50 2024 +0200 tdf#49885 BreakIterator rule upgrades This change re-bases the BreakIterator rule customizations on top of a clean copy of the ICU 74.2 rules. Change-Id: Iadcf16cab138cc6c869fac61ad64e996e65b5ae4 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/166273 Tested-by: Jenkins Tested-by: Caolán McNamara <[email protected]> Reviewed-by: Caolán McNamara <[email protected]> diff --git a/i18npool/CustomTarget_breakiterator.mk b/i18npool/CustomTarget_breakiterator.mk index aaba3c1503de..095672878f3d 100644 --- a/i18npool/CustomTarget_breakiterator.mk +++ b/i18npool/CustomTarget_breakiterator.mk @@ -16,16 +16,12 @@ $(call gb_CustomTarget_get_target,i18npool/breakiterator) : \ i18npool_BRKTXTS := \ count_word.brk \ - $(call gb_Helper_optional_locale,he,dict_word_he.brk) \ $(call gb_Helper_optional_locale,hu,dict_word_hu.brk) \ - dict_word_nodash.brk \ dict_word_prepostdash.brk \ dict_word.brk \ - $(call gb_Helper_optional_locale,he,edit_word_he.brk) \ $(call gb_Helper_optional_locale,hu,edit_word_hu.brk) \ edit_word.brk \ - line.brk \ - sent.brk + line.brk # 'gencmn', 'genbrk' and 'genccode' are tools generated and delivered by icu project to process icu breakiterator rules. # The output of gencmn generates warnings under Windows. We want to minimize the patches to external tools, diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index 2ac46f9bdca6..0103637989e4 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -184,11 +184,10 @@ void TestBreakIterator::testLineBreaking() { // Per the bug, the line break should leave -bar clumped together on the next line. - // However, this change was reverted at some point. 
This test asserts the new behavior. i18n::LineBreakResults aResult = m_xBreak->getLineBreak( u"foo -bar"_ustr, strlen("foo -ba"), aLocale, 0, aHyphOptions, aUserOptions); CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash", - static_cast<sal_Int32>(5), aResult.breakIndex); + static_cast<sal_Int32>(4), aResult.breakIndex); } } @@ -198,11 +197,29 @@ void TestBreakIterator::testLineBreaking() aLocale.Country = "US"; { - // Here we want the line break to leave C:\Program Files\ on the first line + // Note that the current behavior deviates from the original fix for this bug. + // + // The original report was filed due to wrapping all of "\Program Filesaaa" to the + // next line, even though only "aaaa" overflowed. The original fix was to simply make + // U+005C reverse solidus (backslash) a breaking character. + // + // However, the root cause for this bug was not the behavior of '\', but rather some + // other bug making all of "\Program Files\" behave like a single token, despite it + // even containing whitespace. + // + // Reverting to the ICU line rules fixes this root issue. Now, in the following, + // "C:\Program" and "Files\LibreOffice" are treated as separate tokens. This is also + // consistent with the behavior of other office programs. i18n::LineBreakResults aResult = m_xBreak->getLineBreak( u"C:\Program Files\LibreOffice"_ustr, strlen("C:\Program Files\Libre"), aLocale, 0, aHyphOptions, aUserOptions); - CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex); + + // An identical result should be generated for solidus. 
+ aResult = m_xBreak->getLineBreak( + "C:/Program Files/LibreOffice", strlen("C:/Program Files/Libre"), aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex); } } @@ -251,23 +268,125 @@ void TestBreakIterator::testLineBreaking() aLocale.Country = "US"; { + // The root cause for this bug was the Unicode standard introducing special treatment + // for '-' in a number range context. This change makes number ranges (e.g. "100-199") + // behave as if they are single tokens for the purposes of line breaking. Unfortunately, + // this caused a significant appearance change to existing documents. + // + // Despite being a user-visible layout change, this isn't exactly a bug. Wrapping + // number ranges as a single token is consistent with other applications, including web + // browsers, and other office suites as mentioned in the bug discussion. Removing this + // customization seems like it would be a major change, however. + // // Here we want the line break to leave 100- clumped on the first line. 
+ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( u"word 100-199 word"_ustr, strlen("word 100-1"), aLocale, 0, aHyphOptions, aUserOptions); CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(9), aResult.breakIndex); } - } - // i#83649: Line break should be between typographical quote and left bracket - { + { + // From the same bug: "the leading minus must stay with numbers and strings" + + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "range of -100.000 to 100.000", strlen("range of -1"), aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex); + + constexpr OUString str = u"range of \u2212100.000 to 100.000"_ustr; + aResult = m_xBreak->getLineBreak( + str, strlen("range of -"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex); + } + aLocale.Language = "de"; aLocale.Country = "DE"; { - // Here we want the line break to leave »angetan werden« on the first line + // From the same bug: "the leading minus must stay with numbers and strings" + + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "EURO is -10,50", strlen("EURO is -1"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex); + + // Also the mathematical minus sign: + + constexpr OUString str = u"EURO is \u221210,50"_ustr; + aResult = m_xBreak->getLineBreak( + str, strlen("EURO is -"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex); + } + + { + // From the same bug: "the leading minus must stay with numbers and strings" + + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "und -kosten", strlen("und -ko"), aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32{4}, aResult.breakIndex); + + // But not the non-breaking hyphen: + + constexpr OUString str = u"und \u2011"_ustr; + aResult = m_xBreak->getLineBreak( + str, strlen("und -ko"), aLocale, 0, aHyphOptions, 
aUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32{5}, aResult.breakIndex); + } + } + + // i#83649: "Line break should be between typographical quote and left bracket" + // - Actually: Spaces between quotation mark and opening punctuation not treated as a break. + // - Note that per the Unicode standard, prohibiting breaks in this context is intentional + // because it may cause issues in certain languages due to the various ways quotation + // characters are used. + // - We do it anyway by customizing the ICU line breaking rules. + { + { + // This uses the sample text provided in the bug report. Based on usage, it is assumed + // they were in the de_DE locale. + + aLocale.Language = "de"; + aLocale.Country = "DE"; + + // Per the bug report, it is expected that »angetan werden« remains on the first line. const OUString str = u"»angetan werden« [Passiv]"_ustr; i18n::LineBreakResults aResult = m_xBreak->getLineBreak( - str, strlen("Xangetan werdenX ["), aLocale, 0, aHyphOptions, aUserOptions); + str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); + + // The same result should be returned for this and the first case. + const OUString str2 = u"»angetan werden« Passiv"_ustr; + aResult = m_xBreak->getLineBreak( + str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); + + // Under ICU rules, no amount of spaces would cause this to wrap. 
+ const OUString str3 = u"»angetan werden« [Passiv]"_ustr; + aResult = m_xBreak->getLineBreak( + str3, str3.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(20), aResult.breakIndex); + + // However, tabs will + const OUString str4 = u"»angetan werden« [Passiv]"_ustr; + aResult = m_xBreak->getLineBreak( + str4, str4.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); + } + + { + // The same behavior is seen in English + + aLocale.Language = "en"; + aLocale.Country = "US"; + + const OUString str = u"\"angetan werden\" [Passiv]"_ustr; + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); + + const OUString str2 = u"\"angetan werden\" Passiv"_ustr; + aResult = m_xBreak->getLineBreak( + str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); } } @@ -355,7 +474,7 @@ void TestBreakIterator::testLineBreaking() auto res = m_xBreak->getLineBreak(u"Wort -prinzessinnen, wort"_ustr, strlen("Wort -prinzessinnen,"), aLocale, 0, aHyphOptions, aUserOptions); - CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, res.breakIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32{ 5 }, res.breakIndex); } } } @@ -638,7 +757,8 @@ void TestBreakIterator::testWordBoundaries() CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i); } - //See https://bz.apache.org/ooo/show_bug.cgi?id=85411 + // i#85411: ZWSP should be a word separator for spellchecking + // - This fix was applied to both dict and edit customizations for (int j = 0; j < 3; ++j) { switch (j) @@ -660,21 +780,23 @@ void TestBreakIterator::testWordBoundaries() break; } - static constexpr OUString aTest = - u"I\u200Bwant\u200Bto\u200Bgo"_ustr; + static constexpr OUString aTest = u"I\u200Bwant\u200Bto\u200Bgo"_ustr; 
sal_Int32 nPos = 0; - sal_Int32 aExpected[] = {1, 6, 9, 12}; + sal_Int32 aExpected[] = { 1, 6, 9, 12 }; size_t i = 0; do { CPPUNIT_ASSERT(i < std::size(aExpected)); - nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, - i18n::WordType::DICTIONARY_WORD, true).endPos; - CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos); + auto dwPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(aExpected[i], dwPos.endPos); + auto ewPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(aExpected[i], ewPos.endPos); + nPos = dwPos.endPos; ++i; - } - while (nPos++ < aTest.getLength()); + } while (nPos++ < aTest.getLength()); CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i); } @@ -814,121 +936,45 @@ void TestBreakIterator::testWordBoundaries() } // i#56347: "BreakIterator patch for Hungarian" - // Rules for Hungarian affixes after numbers and certain symbols - { - auto mode = i18n::WordType::DICTIONARY_WORD; - aLocale.Language = "hu"; - aLocale.Country = "HU"; - - OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr; - - aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(25), 
aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); - } - // i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian) - // Rules for Hungarian affixes after numbers and certain symbols in edit mode. - // The patch was merged, but the original bug was never closed and the current behavior seems - // identical to the ICU default behavior. Added this test to ensure that doesn't change. + // Rules for Hungarian affixes after numbers and certain symbols { - auto mode = i18n::WordType::ANY_WORD; aLocale.Language = "hu"; aLocale.Country = "HU"; OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr; - aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 13, 
aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 17, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 19, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 20, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 21, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); + for (auto mode : + { i18n::WordType::DICTIONARY_WORD, i18n::WordType::ANYWORD_IGNOREWHITESPACES }) + { + aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 24, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), 
aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 26, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 30, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 31, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); + } } // tdf#49885: Upgrade CJ word boundary analysis to ICU frequency-based analysis @@ -983,6 +1029,56 @@ void TestBreakIterator::testSentenceBoundaries() CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak->beginOfSentence(aTest, 26, aLocale)); CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak->endOfSentence(aTest, 26, aLocale)); } + + // i#55063: Sentence selection in Thai should select a space-delimited phrase. + // - This customization broke at some point. 
It works in an English locale in a synthetic test + // like this one, but does not work in the Thai locale, nor on Thai text in practice. + { + static constexpr OUString aTest = u"ว้อย โหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอก โปรโมเตอร์"_ustr; + + aLocale.Language = "en"; + aLocale.Country = "US"; + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale)); + + aLocale.Language = "th"; + aLocale.Country = "TH"; + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale)); + } + + // i#55063: Thai phrases should delimit English sentence selection. + // - This customization broke at some point. It works in an English locale in a synthetic test + // like this one, but does not work in the Thai locale, nor on Thai text in practice. + { + static constexpr OUString aTest = u"ว้อย English usually ends with a period โปรโมเตอร์."_ustr; + + aLocale.Language = "en"; + aLocale.Country = "US"; + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale)); + + aLocale.Language = "th"; + aLocale.Country = "TH"; + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale)); + } + + // i#55063: Characteristic test for English text delimiting Thai phrases (sentences) + // - English text should not delimit Thai phrases. 
+ { + static constexpr OUString aTest = u"Englishโหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอกEnglish"_ustr; + + aLocale.Language = "en"; + aLocale.Country = "US"; + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale)); + + aLocale.Language = "th"; + aLocale.Country = "TH"; + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale)); + } } //See https://bugs.libreoffice.org/show_bug.cgi?id=40292 @@ -1559,6 +1655,7 @@ void TestBreakIterator::testLegacyHebrewQuoteInsideWord() aLocale.Language = "he"; aLocale.Country = "IL"; + // i#51661: Add quotation mark as middle letter for Hebrew { auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr; @@ -1572,6 +1669,21 @@ void TestBreakIterator::testLegacyHebrewQuoteInsideWord() CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); } + + // i#51661: Add quotation mark as middle letter for Hebrew + { + auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr; + + i18n::Boundary aBounds = m_xBreak->getWordBoundary( + aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); + } } void TestBreakIterator::testLegacySurrogatePairs() diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt index b1666f44daab..f804b0eec214 100644 --- a/i18npool/source/breakiterator/data/dict_word.txt +++ b/i18npool/source/breakiterator/data/dict_word.txt @@ -1,148 +1,199 @@ # -# Copyright (C) 2002-2003, International Business 
Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. # -# file: dict_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. - - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] - [:name = HYPHEN-MINUS:] ]; - -$SufixLetter = [:name= FULL STOP:]; - - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name 
= ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; +############################################################################## + +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION + +!!chain; +!!quoted_literals_only; + # # Character Class Definitions. -# The names are those from TR29. # -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; +$Han = [:Han:]; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$ALetter = [\p{Word_Break = ALetter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; +### BEGIN CUSTOMIZATION +### Unknown issue number: Dictionary words can contain hyphens +### tdf#49885: Sync custom BreakIterator rules with ICU originals +### - ICU is now more permissive about punctuation inside words. 
+### - For compatibility, exclude certain characters that were previously excluded. -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### +$IncludedML = [:name = HYPHEN-MINUS:]; +$ExcludedML = [[:name = COLON:] + [:name = GREEK ANO TELEIA:] + [:name = PRESENTATION FORM FOR VERTICAL COLON:] + [:name = SMALL COLON:] + [:name = FULLWIDTH COLON:]]; -$Format = [[:Cf:] - $TheZWSP]; +# $MidLetter = [\p{Word_Break = MidLetter}]; +$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML]; +### END CUSTOMIZATION +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. 
-# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; +# TODO: check if handling of katakana in dictionary makes rules incorrect/void -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; -[[:P:][:S:]]*; +## ------------------------------------------------- +# Rule 3 - CR x LF # -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +$CR $LF; +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? 
-# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +$ZWJ $Extended_Pict; +# Rule 3d - Keep horizontal whitespace together. # -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +$WSegSpace $WSegSpace; + +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. + +$ExFm = [$Extend $Format $ZWJ]; + +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. + +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, + # with no special rule status value. + +$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. +$Ideographic $ExFm* {400}; # # -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) +# rule 5 +# Do not break between most letters. 
# +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 6 and 7 +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; + +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; + +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; + +# rule 8 + +$Numeric $ExFm* $Numeric; + +# rule 9 + +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; + +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanhi, changed +# from 300 to 400. +# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. +# rules 15 - 17 +# Pairs of Regional Indicators stay together. +# With incoming rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! 
($NonStarters* | ) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +# Rule 999 +# Match a single code point if no other rule applies. +.; diff --git a/i18npool/source/breakiterator/data/dict_word_he.txt b/i18npool/source/breakiterator/data/dict_word_he.txt deleted file mode 100644 index 40197d92a431..000000000000 --- a/i18npool/source/breakiterator/data/dict_word_he.txt +++ /dev/null @@ -1,139 +0,0 @@ -# -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: dict_word.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 -# - - - -#################################################################################### -# -# Character class definitions from TR 29 -# -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - - $Katakana - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]]; - -$SufixLetter = [:name= FULL STOP:]; - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] 
\u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; - -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - - - -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### - -$Format = [[:Cf:] - $TheZWSP]; - - - -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# - - -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$FormatEx = $Format $Extend*; - - -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; - -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? 
{200}; - -[[:P:][:S:]]*; - -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; - -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -# [:IDEOGRAPHIC:] $Extend* {400}; - -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; - -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# - -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. -# -$NonStarters = [$Numeric $ALetter $Katakana [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; - -#!.*; -! ($NonStarters* | ) .; - diff --git a/i18npool/source/breakiterator/data/dict_word_hu.txt b/i18npool/source/breakiterator/data/dict_word_hu.txt index b0a0276b36a8..88648e6e5716 100644 --- a/i18npool/source/breakiterator/data/dict_word_hu.txt +++ b/i18npool/source/breakiterator/data/dict_word_hu.txt @@ -1,176 +1,222 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. # -# file: dict_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. 
-# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. - - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - - -# Fix spelling of a)-ban, b)-ben, when the letter is a reference -# resulting bad word breaking "ban" and "ben" -# (reference fields are not expanded in spell checking, yet, only -# for grammar checking). 
- -$PrefixLetter = [[:name = RIGHT PARENTHESIS:]]; - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] - [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:] - [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] - [:name = DIGIT ZERO:] - [:name = DIGIT ONE:] - [:name = DIGIT TWO:] - [:name = DIGIT THREE:] - [:name = DIGIT FOUR:] - [:name = DIGIT FIVE:] - [:name = DIGIT SIX:] - [:name = DIGIT SEVEN:] - [:name = DIGIT EIGHT:] - [:name = DIGIT NINE:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:] - [:name = EURO SIGN:] [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] - [:name = EN DASH:] [:name = EM DASH:] - [:name = RIGHT DOUBLE QUOTATION MARK:] - [:name = LEFT PARENTHESIS:] - [:name = RIGHT PARENTHESIS:] - [:name = RIGHT SQUARE BRACKET:] - [:name = EXCLAMATION MARK:] - [:name = QUESTION MARK:] - [:name = FULL STOP:] [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]]; - -$SufixLetter = [:name= FULL STOP:]; - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; +############################################################################## + +### BEGIN CUSTOMIZATION 
+### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION + +!!chain; +!!quoted_literals_only; + # # Character Class Definitions. -# The names are those from TR29. # -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - +$Han = [:Han:]; + +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; + +### BEGIN CUSTOMIZATION +### Unknown issue number: Dictionary words can contain hyphens +### tdf#49885: Sync custom BreakIterator rules with ICU originals +### - ICU is now more permissive about punctuation inside words. +### - For compatibility, exclude certain characters that were previously excluded. 
+### tdf#116072: Extend MidLetter in Hungarian word breaking +### i#56347: BreakIterator patch for Hungarian +### i#56348: Special chars in first pos not handled by spell checking for Hungarian + +$Symbols_hu = [[:name = PERCENT SIGN:] + [:name = PER MILLE SIGN:] + [:name = PER TEN THOUSAND SIGN:] + [:name = SECTION SIGN:] + [:name = DEGREE SIGN:] + [:name = EURO SIGN:] + [:name = HYPHEN-MINUS:] + [:name = EN DASH:] + [:name = EM DASH:]]; + +#$ALetter = [\p{Word_Break = ALetter}]; +$ALetter = [\p{Word_Break = ALetter} $Symbols_hu]; + +$IncludedML = [:name = HYPHEN-MINUS:]; +$ExcludedML = [[:name = COLON:] + [:name = GREEK ANO TELEIA:] + [:name = PRESENTATION FORM FOR VERTICAL COLON:] + [:name = SMALL COLON:] + [:name = FULLWIDTH COLON:]]; + +$IncludedML_hu = [[:name = RIGHT DOUBLE QUOTATION MARK:] + [:name = LEFT PARENTHESIS:] + [:name = RIGHT PARENTHESIS:] + [:name = RIGHT SQUARE BRACKET:] + [:name = EXCLAMATION MARK:] + [:name = QUESTION MARK:] + $Symbols_hu]; + +# $MidLetter = [\p{Word_Break = MidLetter}]; +$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML $IncludedML_hu]; + +### END CUSTOMIZATION + +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; + + +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. 
+ +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; + +# TODO: check if handling of katakana in dictionary makes rules incorrect/void + +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; + + +## ------------------------------------------------- + +# Rule 3 - CR x LF +# +$CR $LF; -#################################################################################### +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Word Break Rules. Definitions and Rules specific to word break begin Here. +$ZWJ $Extended_Pict; + +# Rule 3d - Keep horizontal whitespace together. # -#################################################################################### +$WSegSpace $WSegSpace; -$Format = [[:Cf:] - $TheZWSP]; +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. +$ExFm = [$Extend $Format $ZWJ]; +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, + # with no special rule status value. 
+$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. +$Ideographic $ExFm* {400}; # # -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. +# rule 5 +# Do not break between most letters. # -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); +# rule 6 and 7 +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $PrefixLetter? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; -[[:P:][:S:]]*; +# rule 8 -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +$Numeric $ExFm* $Numeric; -# -# Ideographic Characters. 
Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +# rule 9 -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanhi, changed +# from 300 to 400. +# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. +# rules 15 - 17 +# Pairs of Regional Indicators stay together. +# With incoming rule chaining disabled by ^, this rule will match exactly two of them. 
+# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! ($NonStarters* | ) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +# Rule 999 +# Match a single code point if no other rule applies. +.; diff --git a/i18npool/source/breakiterator/data/dict_word_nodash.txt b/i18npool/source/breakiterator/data/dict_word_nodash.txt deleted file mode 100644 index 279cc50e5b66..000000000000 --- a/i18npool/source/breakiterator/data/dict_word_nodash.txt +++ /dev/null @@ -1,147 +0,0 @@ -# -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: dict_word.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29. 
-# These rules are based on Version 4.0.0, dated 2003-04-17 -# - - - -#################################################################################### -# -# Character class definitions from TR 29 -# -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] ]; - -$SufixLetter = [:name= FULL STOP:]; - - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; - -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - - - -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. 
-# -#################################################################################### - -$Format = [[:Cf:] - $TheZWSP]; - - - -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# - - -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; - - -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; - -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; - -[[:P:][:S:]]*; - -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; - -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; - -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. 
-# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; - -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# - -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. -# -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; - -#!.*; -! ($NonStarters* | ) .; - diff --git a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt index fb29b478af21..b39503d1b405 100644 --- a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt +++ b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt @@ -1,157 +1,221 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. # -# file: dict_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. 
- - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; +############################################################################## -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION -# list of dashes or hyphens that should be accepted as part of the word if a single one of these -# pre- or postfixes a word. E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to -# be part of the word in order to have it properly spell checked etc. 
-$PrePostDashHyphen = [ [:name = HYPHEN-MINUS:] ]; +!!chain; +!!quoted_literals_only; -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] - [:name = HYPHEN-MINUS:] ]; +# +# Character Class Definitions. +# -$SufixLetter = [:name= FULL STOP:]; - +$Han = [:Han:]; -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$ALetter = [\p{Word_Break = ALetter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; +### BEGIN CUSTOMIZATION +### Unknown issue number: Dictionary words can contain hyphens +### tdf#49885: Sync custom BreakIterator rules with 
ICU originals +### - ICU is now more permissive about punctuation inside words. +### - For compatibility, exclude certain characters that were previously excluded. -$TheZWSP = \u200b; +$IncludedML = [:name = HYPHEN-MINUS:]; +$ExcludedML = [[:name = COLON:] + [:name = GREEK ANO TELEIA:] + [:name = PRESENTATION FORM FOR VERTICAL COLON:] + [:name = SMALL COLON:] + [:name = FULLWIDTH COLON:]]; -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; +# $MidLetter = [\p{Word_Break = MidLetter}]; +$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML]; +### END CUSTOMIZATION +### BEGIN CUSTOMIZATION +### Unknown issue number: Allow leading and trailing hyphens in certain languages +### This part of the customization does not replace any rules. +$PrePostHyphen = [:name = HYPHEN-MINUS:]; -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### +### END CUSTOMIZATION -$Format = [[:Cf:] - $TheZWSP]; +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. 
-# +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; +# TODO: check if handling of katakana in dictionary makes rules incorrect/void -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; +## ------------------------------------------------- + +# Rule 3 - CR x LF # -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +$CR $LF; +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. +$ZWJ $Extended_Pict; + +# Rule 3d - Keep horizontal whitespace together. # -# At most one leading or trailing dash/hyphen should be accepted as well. -# E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to -# be part of the word in order to have it properly spell checked etc. -$LetterSequence = $PrePostDashHyphen? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)* $PrePostDashHyphen?; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? 
{200}; +$WSegSpace $WSegSpace; -[[:P:][:S:]]*; +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +$ExFm = [$Extend $Format $ZWJ]; -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, + # with no special rule status value. + +$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. +$Ideographic $ExFm* {400}; # # -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) +# rule 5 +# Do not break between most letters. # -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) 
The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. +### BEGIN CUSTOMIZATION +### Unknown issue number: Allow leading and trailing hyphens in certain languages + +# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); +($PrePostHyphen) ? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)?; + +### END CUSTOMIZATION + +# rule 6 and 7 + +### BEGIN CUSTOMIZATION +### Unknown issue number: Allow leading and trailing hyphens in certain languages + +# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; +($PrePostHyphen)? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)? {200}; + +### END CUSTOMIZATION + +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; + +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; + +# rule 8 + +$Numeric $ExFm* $Numeric; + +# rule 9 + +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; + +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanhi, changed +# from 300 to 400. 
+# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) + +# rules 15 - 17 +# Pairs of Regional Indicators stay together. +# With incoming rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! ($NonStarters* | ) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +# Rule 999 +# Match a single code point if no other rule applies. +.; diff --git a/i18npool/source/breakiterator/data/edit_word.txt b/i18npool/source/breakiterator/data/edit_word.txt index 92b344c19d41..14fc221aa96e 100644 --- a/i18npool/source/breakiterator/data/edit_word.txt +++ b/i18npool/source/breakiterator/data/edit_word.txt @@ -1,142 +1,199 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. 
# -# file: edit_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. - - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]]; - -$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; +############################################################################## + +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. 
+### +### END CUSTOMIZATION + +!!chain; +!!quoted_literals_only; + # # Character Class Definitions. -# The names are those from TR29. # -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; +$Han = [:Han:]; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$ALetter = [\p{Word_Break = ALetter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidLetter = [\p{Word_Break = MidLetter}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; +### BEGIN CUSTOMIZATION +### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. +### This change subtracts undesired characters from the above families -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### +# $MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]]; -$Format = [[:Cf:] - $TheZWSP]; +# $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]]; +### END CUSTOMIZATION +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; -# Rule 3: Treat a grapheme cluster as if it were a single character. 
-# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; +# TODO: check if handling of katakana in dictionary makes rules incorrect/void -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? 
$LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200}; -# Punctuations by themselves -[[:P:][:S:]-[:name = FULL STOP:]]*; -[[:name = FULL STOP:]]*; +## ------------------------------------------------- +# Rule 3 - CR x LF # -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +$CR $LF; +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +$ZWJ $Extended_Pict; +# Rule 3d - Keep horizontal whitespace together. # -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +$WSegSpace $WSegSpace; + +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. + +$ExFm = [$Extend $Format $ZWJ]; + +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. + +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words, + # with no special rule status value. + +$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. 
+$Ideographic $ExFm* {400}; # # -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) +# rule 5 +# Do not break between most letters. # +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 6 and 7 +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; + +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; + +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; + +# rule 8 + +$Numeric $ExFm* $Numeric; + +# rule 9 + +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanji, changed +# from 300 to 400. +# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) + +# rules 15 - 17 +# Pairs of Regional Indicators stay together. 
+# With incoming rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! ($NonStarters* | ) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found + +### BEGIN CUSTOMIZATION +### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. +### This customization does not replace any rules. +[[:P:][:S:]-[:name = FULL STOP:]]*; +[[:name = FULL STOP:]]*; +### END CUSTOMIZATION +# Rule 999 +# Match a single code point if no other rule applies. +.; diff --git a/i18npool/source/breakiterator/data/edit_word_he.txt b/i18npool/source/breakiterator/data/edit_word_he.txt deleted file mode 100644 index 0b5908814e08..000000000000 --- a/i18npool/source/breakiterator/data/edit_word_he.txt +++ /dev/null @@ -1,142 +0,0 @@ -# -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: edit_word.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29. 
-# These rules are based on Version 4.0.0, dated 2003-04-17 -# - - - -#################################################################################### -# -# Character class definitions from TR 29 -# -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]]; - -$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; - -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - - - -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### - -$Format = [[:Cf:] - $TheZWSP]; - - - -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# - - -# -# "Extended" definitions. 
Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; - - -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; - -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200}; - -# Punctuations by themselves -[[:P:][:S:]-[:name = FULL STOP:]]*; -[[:name = FULL STOP:]]*; - -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; - -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; - -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; - -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# - -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) 
The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. -# -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format]; - -#!.*; -! ($NonStarters* | ) .; - diff --git a/i18npool/source/breakiterator/data/edit_word_hu.txt b/i18npool/source/breakiterator/data/edit_word_hu.txt index 4a08acab0029..389ad2bacc13 100644 --- a/i18npool/source/breakiterator/data/edit_word_hu.txt +++ b/i18npool/source/breakiterator/data/edit_word_hu.txt @@ -1,159 +1,215 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. # -# file: edit_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. 
- - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] - [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] - [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:] - [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] - [:name = DIGIT ZERO:] - [:name = DIGIT ONE:] - [:name = DIGIT TWO:] - [:name = DIGIT THREE:] - [:name = DIGIT FOUR:] - [:name = DIGIT FIVE:] - [:name = DIGIT SIX:] - [:name = DIGIT SEVEN:] - [:name = DIGIT EIGHT:] - [:name = DIGIT NINE:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] - [:name = HYPHEN-MINUS:] [:name = EURO SIGN:] [:name = PERCENT SIGN:] - [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] - [:name = EN DASH:] [:name = EM DASH:] - [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]]; - -$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]]; -e ... etc. - the rest is truncated
