source

Jonathan Clark (via logerrit) Fri, 29 Nov 2024 09:41:18 -0800

 i18npool/qa/cppunit/test_breakiterator.cxx                   |   45 +++++++++++
 i18npool/source/breakiterator/data/dict_word.txt             |   21 ++++-
 i18npool/source/breakiterator/data/dict_word_prepostdash.txt |    9 +-
 3 files changed, 71 insertions(+), 4 deletions(-)


New commits:
commit f4fe6df6aa92573368c3fa0edb9fd03e64d9d059
Author:     Jonathan Clark <[email protected]>
AuthorDate: Thu Nov 28 12:47:02 2024 -0700
Commit:     Jonathan Clark <[email protected]>
CommitDate: Fri Nov 29 18:40:51 2024 +0100

    tdf#162514 i18npool: Handle abbreviations in dictionary breakiterator
    
    Restores abbreviation handling to spell checking.
    
    Regression from commit 44699b3de37f07090ac6fee1cd97aa76036e9700
     "tdf#49885 BreakIterator rule upgrades".
    
    Change-Id: I2883f984952aa3e54cfe800590a16c0de74ae0e4
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/177506
    Reviewed-by: Jonathan Clark <[email protected]>
    Tested-by: Jenkins

diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
index e56089ad0c28..9d9712e54e71 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -48,6 +48,7 @@ public:
     void testLegacyDictWordPrepostDash_nds_DE();
     void testLegacyDictWordPrepostDash_nl_NL();
     void testLegacyDictWordPrepostDash_sv_SE();
+    void testDictWordAbbreviation();
     void testHebrewGereshGershaim();
     void testLegacySurrogatePairs();
     void testWordCount();
@@ -71,6 +72,7 @@ public:
     CPPUNIT_TEST(testLegacyDictWordPrepostDash_nds_DE);
     CPPUNIT_TEST(testLegacyDictWordPrepostDash_nl_NL);
     CPPUNIT_TEST(testLegacyDictWordPrepostDash_sv_SE);
+    CPPUNIT_TEST(testDictWordAbbreviation);
     CPPUNIT_TEST(testHebrewGereshGershaim);
     CPPUNIT_TEST(testLegacySurrogatePairs);
     CPPUNIT_TEST(testWordCount);
@@ -1666,6 +1668,49 @@ void TestBreakIterator::testLegacyDictWordPrepostDash_de_DE()
     }
 }
 
+void TestBreakIterator::testDictWordAbbreviation() // tdf#162514: DICTIONARY_WORD boundaries keep an abbreviation's trailing period
+{
+    std::vector<lang::Locale> aLocale{
+        { "en", "US", "" }, // dict_word locale
+        { "de", "DE", "" } // dict_word_prepostdash locale
+    };
+
+    for (const auto& rLocale : aLocale) // the same expectations are asserted for every locale listed above
+    {
+        auto aTest = u"Examples: e.g. i.e. etc. and such"_ustr; // 33 code units; the query offsets below index into this text
+
+        i18n::Boundary aBounds
+            = m_xBreak->getWordBoundary(aTest, 3, rLocale, 
i18n::WordType::DICTIONARY_WORD, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); // "Examples" - the ':' at offset 8 is not part of the word
+
+        aBounds
+            = m_xBreak->getWordBoundary(aTest, 10, rLocale, 
i18n::WordType::DICTIONARY_WORD, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos); // "e.g." - inner and trailing periods are both inside the word
+
+        aBounds
+            = m_xBreak->getWordBoundary(aTest, 15, rLocale, 
i18n::WordType::DICTIONARY_WORD, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(15), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); // "i.e." - same shape as "e.g."
+
+        aBounds
+            = m_xBreak->getWordBoundary(aTest, 20, rLocale, 
i18n::WordType::DICTIONARY_WORD, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); // "etc." - only a trailing period, still inside the word
+
+        aBounds
+            = m_xBreak->getWordBoundary(aTest, 26, rLocale, 
i18n::WordType::DICTIONARY_WORD, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); // offset 26 is mid-word; the start snaps back to 25
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(28), aBounds.endPos); // "and" - an ordinary word with no period
+
+        aBounds
+            = m_xBreak->getWordBoundary(aTest, 30, rLocale, 
i18n::WordType::DICTIONARY_WORD, false);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(29), aBounds.startPos); // offset 30 is mid-word; the start snaps back to 29
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(33), aBounds.endPos); // "such" - the word ends at the end of the text
+    }
+}
+
 void TestBreakIterator::testLegacyDictWordPrepostDash_nds_DE()
 {
     lang::Locale aLocale;
diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt
index 4a09af5cf1b2..849b2fe29205 100644
--- a/i18npool/source/breakiterator/data/dict_word.txt
+++ b/i18npool/source/breakiterator/data/dict_word.txt
@@ -70,6 +70,9 @@ $ExcludedML         = [[:name = COLON:]
                        [:name = SMALL COLON:]
                        [:name = FULLWIDTH COLON:]];
 
+### tdf#162514: For spell checking, abbreviations may end with a period.
+$PostPeriod         = [:name = FULL STOP:];
+
 # $MidLetter          = [\p{Word_Break = MidLetter}];
 $MidLetter          = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML];
 
@@ -140,10 +143,24 @@ $Ideographic $ExFm* {400};          #
 # rule 5
 #    Do not break between most letters.
 #
-($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+### BEGIN CUSTOMIZATION
+### tdf#162514: For spell checking, abbreviations may end with a period.
+
+# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
+($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PostPeriod)?;
+
+### END CUSTOMIZATION
 
 # rule 6 and 7
-($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
+
+### BEGIN CUSTOMIZATION
+### tdf#162514: For spell checking, abbreviations may end with a period.
+
+# ($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
+($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PostPeriod)? {200};
+
+### END CUSTOMIZATION
 
 # rule 7a
 $Hebrew_Letter $ExFm* $Single_Quote {200};
diff --git a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
index b39503d1b405..6051c149d23f 100644
--- a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
+++ b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt
@@ -82,6 +82,9 @@ $MidLetter          = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML];
 
 $PrePostHyphen      = [:name = HYPHEN-MINUS:];
 
+### tdf#162514: For spell checking, abbreviations may end with a period.
+$PostPeriod         = [:name = FULL STOP:];
+
 ### END CUSTOMIZATION
 
 $Hiragana           = [:Hiragana:];
@@ -148,9 +151,10 @@ $Ideographic $ExFm* {400};          #
 
 ### BEGIN CUSTOMIZATION
 ### Unknown issue number: Allow leading and trailing hyphens in certain languages
+### tdf#162514: For spell checking, abbreviations may end with a period.
 
 # ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
-($PrePostHyphen) ? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)?;
+($PrePostHyphen) ? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen | $PostPeriod)?;
 
 ### END CUSTOMIZATION
 
@@ -158,9 +162,10 @@ $Ideographic $ExFm* {400};          #
 
 ### BEGIN CUSTOMIZATION
 ### Unknown issue number: Allow leading and trailing hyphens in certain languages
+### tdf#162514: For spell checking, abbreviations may end with a period.
 
 # ($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
-($PrePostHyphen)? ($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)? {200};
+($PrePostHyphen)? ($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen | $PostPeriod)? {200};
 
 ### END CUSTOMIZATION

core.git: i18npool/qa i18npool/source

Reply via email to