i18nutil

Khaled Hosny (via logerrit) Mon, 24 Jul 2023 17:15:22 -0700

 editeng/source/editeng/impedit2.cxx |   22 ++++++++++++----------
 i18nutil/source/utility/unicode.cxx |   12 +++++++-----
 include/i18nutil/unicode.hxx        |   10 +++++-----
 3 files changed, 24 insertions(+), 20 deletions(-)


New commits:
commit 529dce3d5b695637a1ccc8b4b697d87c5db0d3a2
Author:     Khaled Hosny <kha...@libreoffice.org>
AuthorDate: Mon Jul 24 22:07:25 2023 +0300
Commit:     خالد حسني <kha...@libreoffice.org>
CommitDate: Tue Jul 25 02:15:05 2023 +0200

    editeng: Call unicode::getUnicodeType() on UTF-32 code units
    
    By using OUString::iterateCodePoints() instead of accessing individual
    UTF-16 code units that might be halves of surrogate pairs.
    
    Change-Id: I5e3e513a788f0c939f96b0521fed16fe4848a053
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/154875
    Tested-by: Jenkins
    Reviewed-by: خالد حسني <kha...@libreoffice.org>

diff --git a/editeng/source/editeng/impedit2.cxx b/editeng/source/editeng/impedit2.cxx
index 664c8f5d18f0..c976ebf93cb2 100644
--- a/editeng/source/editeng/impedit2.cxx
+++ b/editeng/source/editeng/impedit2.cxx
@@ -1732,10 +1732,10 @@ void ImpEditEngine::InitScriptTypes( sal_Int32 nPara )
         {
             if ( _xBI->getScriptType( aText, nPos - 1 ) == 
i18n::ScriptType::WEAK )
             {
-                switch ( u_charType(aText.iterateCodePoints(&nPos, 0) ) ) {
-                case U_NON_SPACING_MARK:
-                case U_ENCLOSING_MARK:
-                case U_COMBINING_SPACING_MARK:
+                switch (unicode::getUnicodeType(aText.iterateCodePoints(&nPos, 
0))) {
+                case css::i18n::UnicodeType::NON_SPACING_MARK:
+                case css::i18n::UnicodeType::ENCLOSING_MARK:
+                case css::i18n::UnicodeType::COMBINING_SPACING_MARK:
                     --nPos;
                     rTypes.back().nEndPos--;
                     break;
@@ -2761,7 +2761,9 @@ EditPaM ImpEditEngine::ImpInsertText(const EditSelection& aCurSel, const OUStrin
                 sal_Int32 nPos = nMaxNewChars;
                 while (nPos-- > 0 && (nMaxNewChars - nPos) <= 84)
                 {
-                    switch (unicode::getUnicodeType(aLine[nPos]))
+                    auto nNextPos = nPos;
+                    const auto c = aLine.iterateCodePoints(&nNextPos);
+                    switch (unicode::getUnicodeType(c))
                     {
                         case css::i18n::UnicodeType::UPPERCASE_LETTER:
                         case css::i18n::UnicodeType::LOWERCASE_LETTER:
@@ -2775,24 +2777,24 @@ EditPaM ImpEditEngine::ImpInsertText(const EditSelection& aCurSel, const OUStrin
                         break;
                         default:
                             {
-                                const sal_Unicode c = aLine[nPos];
                                 // Ignore NO-BREAK spaces, NBSP, NNBSP, ZWNBSP.
                                 if (c == 0x00A0 || c == 0x202F || c == 0xFEFF)
                                     break;
-                                if (c == '-' && nPos + 1 < nMaxNewChars)
+                                const auto n = 
aLine.iterateCodePoints(&nNextPos, 0);
+                                if (c == '-' && nNextPos < nMaxNewChars)
                                 {
                                     // Keep HYPHEN-MINUS with a number to the 
right.
-                                    const sal_Int16 t = 
unicode::getUnicodeType(aLine[nPos+1]);
+                                    const sal_Int16 t = 
unicode::getUnicodeType(n);
                                     if (    t == 
css::i18n::UnicodeType::DECIMAL_DIGIT_NUMBER ||
                                             t == 
css::i18n::UnicodeType::LETTER_NUMBER ||
                                             t == 
css::i18n::UnicodeType::OTHER_NUMBER)
                                         nMaxNewChars = nPos;        // line 
break before
                                     else
-                                        nMaxNewChars = nPos + 1;    // line 
break after
+                                        nMaxNewChars = nNextPos;    // line 
break after
                                 }
                                 else
                                 {
-                                    nMaxNewChars = nPos + 1;        // line 
break after
+                                    nMaxNewChars = nNextPos;        // line 
break after
                                 }
                                 nPos = 0;   // will break loop
                             }
commit 2e6e40b7453e2005d46ba7866feff2f2caa1f100
Author:     Khaled Hosny <kha...@libreoffice.org>
AuthorDate: Mon Jul 24 20:52:44 2023 +0300
Commit:     خالد حسني <kha...@libreoffice.org>
CommitDate: Tue Jul 25 02:14:56 2023 +0200

    i18nutil: Make unicode::getUnicodeType() take a UTF-32 code point
    
    Still need to fix call sites to handle surrogate pairs.
    
    Change-Id: I3ba896714fc6a90596c041148a3c9d965f60f4a1
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/154874
    Tested-by: Jenkins
    Reviewed-by: خالد حسني <kha...@libreoffice.org>

diff --git a/i18nutil/source/utility/unicode.cxx b/i18nutil/source/utility/unicode.cxx
index dbb81a8240f1..ae7b4c512aca 100644
--- a/i18nutil/source/utility/unicode.cxx
+++ b/i18nutil/source/utility/unicode.cxx
@@ -67,9 +67,10 @@ unicode::getUnicodeScriptEnd( UnicodeScript type) {
 }
 
 sal_Int16
-unicode::getUnicodeType( const sal_Unicode ch ) {
-    static sal_Unicode c = 0x00;
-    static sal_Int16 r = 0x00;
+unicode::getUnicodeType(const sal_uInt32 ch)
+{
+    static sal_uInt32 c = 0x00;
+    static sal_uInt32 r = 0x00;
 
     if (ch == c) return r;
     else c = ch;
@@ -213,7 +214,7 @@ sal_uInt32 unicode::GetMirroredChar(sal_uInt32 nChar) {
             bit(UnicodeType::PARAGRAPH_SEPARATOR)
 
 #define IsType(func, mask)  \
-bool func( const sal_Unicode ch) {\
+bool func( const sal_uInt32 ch) {\
     return (bit(getUnicodeType(ch)) & (mask)) != 0;\
 }
 
@@ -224,7 +225,8 @@ IsType(unicode::isSpace, SPACEMASK)
 #define CONTROLSPACE    bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
             bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
 
-bool unicode::isWhiteSpace( const sal_Unicode ch) {
+bool unicode::isWhiteSpace(const sal_uInt32 ch)
+{
     return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & 
(CONTROLSPACE)));
 }
 
diff --git a/include/i18nutil/unicode.hxx b/include/i18nutil/unicode.hxx
index 69e3e9d6e267..be08595e0b10 100644
--- a/include/i18nutil/unicode.hxx
+++ b/include/i18nutil/unicode.hxx
@@ -38,17 +38,17 @@ struct ScriptTypeList
 class I18NUTIL_DLLPUBLIC unicode
 {
 public:
-    static sal_Int16 getUnicodeType(const sal_Unicode ch);
+    static sal_Int16 getUnicodeType(const sal_uInt32 ch);
     static sal_Int16 getUnicodeScriptType(const sal_Unicode ch, const 
ScriptTypeList* typeList,
                                           sal_Int16 unknownType = 0);
     static sal_Unicode getUnicodeScriptStart(css::i18n::UnicodeScript type);
     static sal_Unicode getUnicodeScriptEnd(css::i18n::UnicodeScript type);
     static sal_uInt8 getUnicodeDirection(const sal_Unicode ch);
     static sal_uInt32 GetMirroredChar(sal_uInt32);
-    static bool isControl(const sal_Unicode ch);
-    static bool isAlpha(const sal_Unicode ch);
-    static bool isSpace(const sal_Unicode ch);
-    static bool isWhiteSpace(const sal_Unicode ch);
+    static bool isControl(const sal_uInt32 ch);
+    static bool isAlpha(const sal_uInt32 ch);
+    static bool isSpace(const sal_uInt32 ch);
+    static bool isWhiteSpace(const sal_uInt32 ch);
 
     /** Check for Unicode variation sequence selectors

[Libreoffice-commits] core.git: 2 commits - editeng/source i18nutil/source include/i18nutil

Reply via email to