source

László Németh (via logerrit) Sun, 31 Dec 2023 15:43:00 -0800

 include/linguistic/lngprophelp.hxx                    |    8 
 lingucomponent/source/hyphenator/hyphen/hyphenimp.cxx |  181 ++++++++++++++++++
 linguistic/source/lngprophelp.cxx                     |   15 +
 3 files changed, 202 insertions(+), 2 deletions(-)


New commits:
commit c899d3608d30f3ab4c2bc193c1fcd765221614a4
Author:     László Németh <nem...@numbertext.org>
AuthorDate: Sun Dec 31 14:30:05 2023 +0100
Commit:     László Németh <nem...@numbertext.org>
CommitDate: Mon Jan 1 00:42:35 2024 +0100

    tdf#158885 sw: don't hyphenate right after a stem boundary
    
    in compound words to get better typography or orthography
    with more readable text, if hyphenation zone is enabled.
    
    If there are multiple possible break points in the word
    according to the libhyphen based hyphenation, keep only
    the best ones using Hunspell morphological data
    based on compound word decomposition of non-dictionary
    words (pa: fields), and extra morphological data of dictionary
    words (hy: fields) or their combination.
    
    For readability and tradition, orthography and typography
    prefer or only allow hyphenation between stems in compound
    words in several languages, like Danish, Dutch, German,
    Hungarian, Norwegian and Swedish.
    
    The hyphenation zone exists to avoid too much or bad hyphenation.
    Preferring stem boundaries for hyphenation within the hyphenation
    zone is a natural extension of it, i.e. skip hyphenation within
    stems if there is a stem boundary within the hyphenation zone.
    
    Now skip break points after stem boundaries, if their
    distance is 3 or fewer characters (COMPOUNDLEFTHYPHENMIN = 4).
    
    Skip also break points on stem boundaries, if there is a
    weighted stem boundary before them within 3 characters.
    
    Weighted stem boundaries are the boundaries between the pa: fields
    (stems resulting from the compound word decomposition),
    or, in hy: fields, the boundaries marked by a double || instead of
    a single |.
    
    More information: see "man 5 hunspell" and the -m option of hunspell.
    
    Note: for languages with fogemorphemes (linking morphemes), break
    points are skipped only in the last stems for now, because the
    Hunspell morphological analysis output is incomplete for them.
    
    Change-Id: I739908716d11a9c2db0c9d36fba8657ba6f53bee
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/161498
    Tested-by: Jenkins
    Reviewed-by: László Németh <nem...@numbertext.org>

diff --git a/include/linguistic/lngprophelp.hxx 
b/include/linguistic/lngprophelp.hxx
index 85b32b7cec3e..a3ad3d2665ac 100644
--- a/include/linguistic/lngprophelp.hxx
+++ b/include/linguistic/lngprophelp.hxx
@@ -242,13 +242,15 @@ class PropertyHelper_Hyphen final :
     // default values
     sal_Int16   nHyphMinLeading,
             nHyphMinTrailing,
-            nHyphMinWordLength;
+            nHyphMinWordLength,
+            nHyphTextHyphenZone;
     bool bNoHyphenateCaps;
 
     // return values, will be set to default value or current temporary value
     sal_Int16   nResHyphMinLeading,
             nResHyphMinTrailing,
-            nResHyphMinWordLength;
+            nResHyphMinWordLength,
+            nResHyphTextHyphenZone;
     bool bResNoHyphenateCaps;
 
     PropertyHelper_Hyphen( const PropertyHelper_Hyphen & ) = delete;
@@ -275,6 +277,7 @@ public:
     sal_Int16   GetMinLeading() const               { return 
nResHyphMinLeading; }
     sal_Int16   GetMinTrailing() const              { return 
nResHyphMinTrailing; }
     sal_Int16   GetMinWordLength() const            { return 
nResHyphMinWordLength; }
+    sal_Int16   GetTextHyphenZone() const           { return 
nResHyphTextHyphenZone; }
     bool IsNoHyphenateCaps() const { return bResNoHyphenateCaps; }
 };
 
@@ -298,6 +301,7 @@ public:
     sal_Int16   GetMinLeading() const;
     sal_Int16   GetMinTrailing() const;
     sal_Int16   GetMinWordLength() const;
+    sal_Int16   GetTextHyphenZone() const;
     bool IsNoHyphenateCaps() const;
     /// @throws css::uno::RuntimeException
     bool addLinguServiceEventListener(
diff --git a/lingucomponent/source/hyphenator/hyphen/hyphenimp.cxx 
b/lingucomponent/source/hyphenator/hyphen/hyphenimp.cxx
index 8ac156ef8cb3..cb66c585e9fa 100644
--- a/lingucomponent/source/hyphenator/hyphen/hyphenimp.cxx
+++ b/lingucomponent/source/hyphenator/hyphen/hyphenimp.cxx
@@ -20,10 +20,13 @@
 #include <com/sun/star/uno/Reference.h>
 
 #include <comphelper/sequence.hxx>
+#include <comphelper/processfactory.hxx>
 #include <cppuhelper/factory.hxx>
 #include <cppuhelper/supportsservice.hxx>
 #include <cppuhelper/weak.hxx>
 #include <com/sun/star/linguistic2/XLinguProperties.hpp>
+#include <com/sun/star/linguistic2/LinguServiceManager.hpp>
+#include <com/sun/star/linguistic2/XSpellChecker1.hpp>
 #include <i18nlangtag/languagetag.hxx>
 #include <tools/debug.hxx>
 #include <osl/mutex.hxx>
@@ -53,6 +56,10 @@
 #include <vector>
 #include <set>
 #include <memory>
+#include <o3tl/string_view.hxx>
+
+// XML-header to query SPELLML support
+constexpr OUStringLiteral SPELLML_SUPPORT = u"<?xml?>";
 
 using namespace utl;
 using namespace osl;
@@ -63,6 +70,13 @@ using namespace com::sun::star::uno;
 using namespace com::sun::star::linguistic2;
 using namespace linguistic;
 
+static uno::Reference< XLinguServiceManager2 > GetLngSvcMgr_Impl()
+{
+    uno::Reference< XComponentContext > xContext( 
comphelper::getProcessComponentContext() );
+    uno::Reference< XLinguServiceManager2 > xRes = 
LinguServiceManager::create( xContext ) ;
+    return xRes;
+}
+
 Hyphenator::Hyphenator() :
     aEvtListeners   ( GetLinguMutex() )
 {
@@ -251,6 +265,7 @@ Reference< XHyphenatedWord > SAL_CALL 
Hyphenator::hyphenate( const OUString& aWo
     sal_Int16 minTrail = rHelper.GetMinTrailing();
     sal_Int16 minLead = rHelper.GetMinLeading();
     sal_Int16 minLen = rHelper.GetMinWordLength();
+    sal_Int16 nHyphZone = rHelper.GetTextHyphenZone();
     bool bNoHyphenateCaps = rHelper.IsNoHyphenateCaps();
 
     rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
@@ -364,6 +379,16 @@ Reference< XHyphenatedWord > SAL_CALL 
Hyphenator::hyphenate( const OUString& aWo
 
         sal_Int32 Leading =  GetPosInWordToCheck( aWord, nMaxLeading );
 
+        // use morphological analysis of Hunspell to get better hyphenation of 
compound words
+        // optionally when hyphenation zone is enabled
+        // pa: fields contain stems resulted by compound word analysis of 
non-dictionary words
+        // hy: fields contain hyphenation data of dictionary (compound) words
+        Reference< XSpellAlternatives > xTmpRes;
+        bool bAnalyzed = false; // enough the analyse once the word
+        bool bCompoundHyphenation = true; // try to hyphenate compound words 
better
+        OUString sStems; // processed result of the compound word analysis, 
e.g. com|pound|word
+        sal_Int32 nSuffixLen = 0; // do not remove break points in suffixes
+
         for (sal_Int32 i = 0; i < n; i++)
         {
             int leftrep = 0;
@@ -393,6 +418,162 @@ Reference< XHyphenatedWord > SAL_CALL 
Hyphenator::hyphenate( const OUString& aWo
             }
             if (hit)
             {
+                // skip hyphenation right after stem boundaries in compound 
words
+                // if hyphenation zone is enabled (default value: less than 
4-character distance)
+                if ( bCompoundHyphenation && nHyphZone && nHyphenationPos > -1 
&& i - nHyphenationPos < 4 )
+                {
+                    uno::Reference< XLinguServiceManager2 > xLngSvcMgr( 
GetLngSvcMgr_Impl() );
+                    uno::Reference< XSpellChecker1 > xSpell;
+
+                    LanguageType nLanguage = LinguLocaleToLanguage( aLocale );
+
+                    xSpell.set( xLngSvcMgr->getSpellChecker(), UNO_QUERY );
+
+                    // get morphological analysis of the word
+                    if ( ( bAnalyzed && xTmpRes.is() ) || ( xSpell.is() && 
xSpell->isValid(
+                            SPELLML_SUPPORT, 
static_cast<sal_uInt16>(nLanguage),
+                            uno::Sequence< beans::PropertyValue >() ) ) )
+                    {
+                        if ( !bAnalyzed )
+                        {
+                            xTmpRes = xSpell->spell( "<?xml?><query 
type='analyze'><word>" +
+                                                       aWord + 
"</word></query>",
+                                               
static_cast<sal_uInt16>(nLanguage),
+                                               uno::Sequence< 
beans::PropertyValue >() );
+                            bAnalyzed = true;
+
+                            if (xTmpRes.is())
+                            {
+                                Sequence<OUString>seq = 
xTmpRes->getAlternatives();
+                                if (seq.hasElements())
+                                {
+                                    sal_Int32 nEndOfFirstAnalysis = 
seq[0].indexOf("</a>");
+                                    // FIXME use only the first analysis
+                                    OUString morph(
+                                            seq[0].copy(0, 
nEndOfFirstAnalysis));
+
+                                    // concatenate pa: fields, i.e. stems in 
the analysis:
+                                    // pa:stem1 pa:stem2 pa:stem3 -> 
stem1||stem2||stem3
+                                    sal_Int32 nPa = -1;
+                                    while ( (nPa = morph.indexOf(u" pa:", nPa 
+ 1)) > -1 )
+                                    {
+                                        // use hy: field of the actual stem, 
if it exists
+                                        // pa:stem1 hy:st|em1 pa:stem2 -> 
st|em1||stem2
+                                        sal_Int32 nHy = morph.indexOf(u" hy:", 
nPa + 3);
+                                        sal_Int32 nPa2 = morph.indexOf(u" 
pa:", nPa + 3);
+
+                                        if ( nHy > -1 && ( nPa2 == -1 || nHy < 
nPa2 ) )
+                                        {
+                                            OUString sStems2(morph.getToken(1, 
' ', nHy).copy(3));
+                                            if ( sStems2.indexOf('|') > -1 )
+                                                sStems += sStems2+ u"||";
+                                            else if ( sal_Int32 nBreak = 
o3tl::toInt32(sStems2) )
+                                            {
+                                                OUString sPa(morph.getToken(1, 
' ', nPa).copy(3));
+                                                if ( nBreak < sPa.getLength() )
+                                                    sStems += 
OUString::Concat(sPa.subView(0, nBreak)) + u"|" +
+                                                           sPa.subView(nBreak);
+                                            }
+                                        }
+                                        else
+                                        {
+                                            OUString sPa(morph.getToken(1, ' 
', nPa).copy(3));
+
+                                            // handle special case: missing 
pa: in morphological analysis
+                                            // before in-word suffixes 
(German, Sweden etc. dictionaries)
+                                            // (recognized by the single last 
pa:)
+                                            if (sStems.isEmpty() && nPa2 == -1 
&& aWord.endsWith(sPa))
+                                            {
+                                                sStems = 
OUString::Concat(aWord.subView(0, aWord.getLength() -
+                                                             sPa.getLength())) 
+ u"||" +
+                                                         
aWord.subView(aWord.getLength() -
+                                                             sPa.getLength());
+                                                break;
+                                            }
+
+                                            sStems += sPa + "||";
+
+                                            // count suffix length
+                                            sal_Int32 nSt = 
morph.lastIndexOf(" st:");
+                                            if ( nSt > -1 )
+                                            {
+                                                sal_Int32 nStemLen =
+                                                    o3tl::getToken(morph, 1, ' 
', nSt).length() - 3;
+                                                if ( nStemLen < 
sPa.getLength() )
+                                                    nSuffixLen = 
sPa.getLength() - nStemLen;
+                                            }
+                                        }
+
+                                        if ( nPa == -1 ) // getToken() can 
modify nPa
+                                            break;
+                                    }
+
+                                    // only hy:, but not pa:
+                                    if ( sStems.isEmpty() )
+                                    {
+                                        // check hy: (pre-defined hyphenation)
+                                        sal_Int32 nHy = morph.indexOf(" hy:");
+                                        if (nHy > -1)
+                                        {
+                                            sStems = morph.getToken(1, ' ', 
nHy).copy(3);
+                                            if ( sStems.indexOf('|') == -1 && 
sStems.indexOf('-') == -1 )
+                                            {
+                                                if ( sal_Int32 nBreak = 
o3tl::toInt32(sStems) )
+                                                {
+                                                    if ( nBreak < 
aWord.getLength() )
+                                                        sStems += 
OUString::Concat(aWord.subView(0, nBreak)) + u"|" +
+                                                               
aWord.subView(nBreak);
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+
+                        // handle string separated by |, e.g "program 
hy:pro|gram"
+                        if ( sStems.indexOf('|') > -1 )
+                        {
+                            sal_Int32 nLetters = 0; // count not separator 
characters
+                            sal_Int32 nSepPos = -1; // position of last 
character | used for stem boundaries
+                            bool bWeightedSep = false; // double separator || 
= weighted stem boundary
+                            sal_Int32 j = 0;
+                            for (; j < sStems.getLength() && nLetters <= i; 
j++)
+                            {
+                                if ( sStems[j] == '|' )
+                                {
+                                    bWeightedSep = nSepPos > -1 && (j - 1 == 
nSepPos);
+                                    nSepPos = j;
+                                }
+                                else if ( sStems[j] != '-' && sStems[j] != '=' 
&& sStems[j] != '*' )
+                                    ++nLetters;
+                            }
+                            // skip break points near stem boundaries
+                            if (
+                                // there is a stem boundary before the actual 
break point
+                                nSepPos > -1 &&
+                                // and the break point is within a stem, i.e. 
not in the
+                                // suffix of the last stem
+                                i < aWord.getLength() - nSuffixLen - 1 &&
+                                // and it is not another stem boundary
+                                j + 1 < sStems.getLength() &&
+                                ( sStems[j + 1] != u'|' ||
+                                // except if it's only the previous was a 
weighted one
+                                    ( bWeightedSep && ( j + 2 == 
sStems.getLength() ||
+                                                        sStems[j + 2] != u'|' 
) ) ) )
+                            {
+                                continue;
+                            }
+                        }
+                        else
+                            // not a compound word
+                            bCompoundHyphenation = false;
+                    }
+                    else
+                        // no SPELLML support, no morphological analysis
+                        bCompoundHyphenation = false;
+                }
+
                 nHyphenationPos = i;
                 if (rep && rep[i])
                 {
diff --git a/linguistic/source/lngprophelp.cxx 
b/linguistic/source/lngprophelp.cxx
index 57483d062e55..ee593f9f489f 100644
--- a/linguistic/source/lngprophelp.cxx
+++ b/linguistic/source/lngprophelp.cxx
@@ -508,6 +508,7 @@ void PropertyHelper_Hyphen::SetDefaultValues()
     nResHyphMinLeading      = nHyphMinLeading       = 2;
     nResHyphMinTrailing     = nHyphMinTrailing      = 2;
     nResHyphMinWordLength   = nHyphMinWordLength    = 0;
+    nResHyphTextHyphenZone  = nHyphTextHyphenZone   = 0;
     bResNoHyphenateCaps = bNoHyphenateCaps = false;
 }
 
@@ -542,6 +543,11 @@ void PropertyHelper_Hyphen::GetCurrentValues()
             pnVal    = &nHyphMinWordLength;
             pnResVal = &nResHyphMinWordLength;
         }
+        else if ( rPropName == UPN_HYPH_ZONE )
+        {
+            pnVal    = &nHyphTextHyphenZone;
+            pnResVal = &nResHyphTextHyphenZone;
+        }
         else if ( rPropName == UPN_HYPH_NO_CAPS )
         {
             pbVal    = &bNoHyphenateCaps;
@@ -575,6 +581,7 @@ bool PropertyHelper_Hyphen::propertyChange_Impl( const 
PropertyChangeEvent& rEvt
             case UPH_HYPH_MIN_LEADING     : pnVal = &nHyphMinLeading; break;
             case UPH_HYPH_MIN_TRAILING    : pnVal = &nHyphMinTrailing; break;
             case UPH_HYPH_MIN_WORD_LENGTH : pnVal = &nHyphMinWordLength; break;
+            case UPH_HYPH_ZONE            : pnVal = &nHyphTextHyphenZone; 
break;
             case UPH_HYPH_NO_CAPS : pbVal = &bNoHyphenateCaps; break;
             default:
                 SAL_WARN( "linguistic", "unknown property handle " << 
rEvt.PropertyHandle << " (check in include/unotools/linguprops.hxx)");
@@ -613,6 +620,7 @@ void PropertyHelper_Hyphen::SetTmpPropVals( const 
PropertyValues &rPropVals )
     nResHyphMinLeading      = nHyphMinLeading;
     nResHyphMinTrailing     = nHyphMinTrailing;
     nResHyphMinWordLength   = nHyphMinWordLength;
+    nResHyphTextHyphenZone  = nHyphTextHyphenZone;
     bResNoHyphenateCaps = bNoHyphenateCaps;
 
     for (const PropertyValue& rVal : rPropVals)
@@ -626,6 +634,8 @@ void PropertyHelper_Hyphen::SetTmpPropVals( const 
PropertyValues &rPropVals )
             pnResVal = &nResHyphMinTrailing;
         else if ( rVal.Name == UPN_HYPH_MIN_WORD_LENGTH )
             pnResVal = &nResHyphMinWordLength;
+        else if ( rVal.Name == UPN_HYPH_ZONE )
+            pnResVal = &nResHyphTextHyphenZone;
         else if ( rVal.Name == UPN_HYPH_NO_CAPS )
             pbResVal = &bResNoHyphenateCaps;
 
@@ -705,6 +715,11 @@ sal_Int16 PropertyHelper_Hyphenation::GetMinWordLength() 
const
     return mxPropHelper->GetMinWordLength();
 }
 
+sal_Int16 PropertyHelper_Hyphenation::GetTextHyphenZone() const
+{
+    return mxPropHelper->GetTextHyphenZone();
+}
+
 bool PropertyHelper_Hyphenation::IsNoHyphenateCaps() const
 {
     return mxPropHelper->IsNoHyphenateCaps();

core.git: include/linguistic lingucomponent/source linguistic/source

Reply via email to