Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package ghc-doclayout for openSUSE:Factory checked in at 2021-11-11 21:36:26 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/ghc-doclayout (Old) and /work/SRC/openSUSE:Factory/.ghc-doclayout.new.1890 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "ghc-doclayout" Thu Nov 11 21:36:26 2021 rev:7 rq:930324 version:0.3.1.1 Changes: -------- --- /work/SRC/openSUSE:Factory/ghc-doclayout/ghc-doclayout.changes 2021-03-24 16:15:58.704129520 +0100 +++ /work/SRC/openSUSE:Factory/.ghc-doclayout.new.1890/ghc-doclayout.changes 2021-11-11 21:36:39.528899014 +0100 @@ -1,0 +2,21 @@ +Tue Oct 12 10:51:12 UTC 2021 - [email protected] + +- Update doclayout to version 0.3.1.1. + ## 0.3.1.1 + + * Fix the end of the block of zero width characters which contains + the zero-width joiners and directional markings (Stephen Morgan, #5). + This fixes a regression introduced in 0.3.1, affecting code + points 0x2010 to 0x2030. + + ## 0.3.1 + + * Improved handling of emojis. Emojis are double-wide, but + previously this library did not treat them as such. We now + have comprehensive support of emojis, including variation + modifiers and zero-width joiners, verified by a test suite. + Performance has been confirmed to be no worse for text without emojis. + (Stephen Morgan, #1). API changes: export `realLengthNoShortcut`, + `isEmojiModifier`, `isEmojiVariation`, `isEmojiJoiner`. 
+ +------------------------------------------------------------------- Old: ---- doclayout-0.3.0.2.tar.gz New: ---- doclayout-0.3.1.1.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ ghc-doclayout.spec ++++++ --- /var/tmp/diff_new_pack.pyhXqp/_old 2021-11-11 21:36:39.984899347 +0100 +++ /var/tmp/diff_new_pack.pyhXqp/_new 2021-11-11 21:36:39.988899350 +0100 @@ -19,13 +19,15 @@ %global pkg_name doclayout %bcond_with tests Name: ghc-%{pkg_name} -Version: 0.3.0.2 +Version: 0.3.1.1 Release: 0 Summary: A prettyprinting library for laying out text documents License: BSD-3-Clause URL: https://hackage.haskell.org/package/%{pkg_name} Source0: https://hackage.haskell.org/package/%{pkg_name}-%{version}/%{pkg_name}-%{version}.tar.gz BuildRequires: ghc-Cabal-devel +BuildRequires: ghc-containers-devel +BuildRequires: ghc-emojis-devel BuildRequires: ghc-mtl-devel BuildRequires: ghc-rpm-macros BuildRequires: ghc-safe-devel @@ -35,6 +37,7 @@ BuildRequires: ghc-tasty-devel BuildRequires: ghc-tasty-golden-devel BuildRequires: ghc-tasty-hunit-devel +BuildRequires: ghc-tasty-quickcheck-devel %endif %description ++++++ doclayout-0.3.0.2.tar.gz -> doclayout-0.3.1.1.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/doclayout-0.3.0.2/changelog.md new/doclayout-0.3.1.1/changelog.md --- old/doclayout-0.3.0.2/changelog.md 2021-03-15 22:20:36.000000000 +0100 +++ new/doclayout-0.3.1.1/changelog.md 2021-10-12 05:54:17.000000000 +0200 @@ -1,5 +1,22 @@ # doclayout +## 0.3.1.1 + + * Fix the end of the block of zero width characters which contains + the zero-width joiners and directional markings (Stephen Morgan, #5). + This fixes a regression introduced in 0.3.1, affecting code + points 0x2010 to 0x2030. + +## 0.3.1 + + * Improved handling of emojis. Emojis are double-wide, but + previously this library did not treat them as such. 
We now + have comprehensive support of emojis, including variation + modifiers and zero-width joiners, verified by a test suite. + Performance has been confirmed to be no worse for text without emojis. + (Stephen Morgan, #1). API changes: export `realLengthNoShortcut`, + `isEmojiModifier`, `isEmojiVariation`, `isEmojiJoiner`. + ## 0.3.0.2 * NOINLINE `literal` instead of `fromString` (#2, sjakobi). diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/doclayout-0.3.0.2/doclayout.cabal new/doclayout-0.3.1.1/doclayout.cabal --- old/doclayout-0.3.0.2/doclayout.cabal 2021-03-15 22:19:41.000000000 +0100 +++ new/doclayout-0.3.1.1/doclayout.cabal 2021-10-12 05:51:56.000000000 +0200 @@ -1,5 +1,5 @@ name: doclayout -version: 0.3.0.2 +version: 0.3.1.1 synopsis: A prettyprinting library for laying out text documents. description: doclayout is a prettyprinting library for laying out text documents, with several features not present @@ -23,6 +23,8 @@ exposed-modules: Text.DocLayout build-depends: base >= 4.9 && < 5, text, + containers, + emojis >=0.1.2, mtl, safe if !impl(ghc >= 8.0) @@ -40,7 +42,9 @@ tasty, tasty-golden, tasty-hunit, - text + tasty-quickcheck, + text, + emojis >=0.1.2 ghc-options: -threaded -rtsopts -with-rtsopts=-N default-language: Haskell2010 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/doclayout-0.3.0.2/src/Text/DocLayout.hs new/doclayout-0.3.1.1/src/Text/DocLayout.hs --- old/doclayout-0.3.0.2/src/Text/DocLayout.hs 2021-03-15 22:18:46.000000000 +0100 +++ new/doclayout-0.3.1.1/src/Text/DocLayout.hs 2021-10-12 05:49:36.000000000 +0200 @@ -69,6 +69,10 @@ , height , charWidth , realLength + , realLengthNoShortcut + , isEmojiModifier + , isEmojiVariation + , isEmojiJoiner -- * Types , Doc(..) , HasChars(..) 
@@ -76,14 +80,16 @@ where import Prelude -import Data.List (foldl') import Data.Maybe (fromMaybe) +import Data.Monoid (Sum(..)) import Safe (lastMay, initSafe) import Control.Monad import Control.Monad.State.Strict import GHC.Generics -import Data.Char (isSpace) -import Data.List (intersperse) +import Data.Char (isDigit, isSpace, ord) +import Data.List (foldl', intersperse) +import Data.List.NonEmpty (NonEmpty(..)) +import qualified Data.IntMap.Strict as IM import Data.Data (Data, Typeable) import Data.String import qualified Data.Text as T @@ -93,6 +99,7 @@ #else import Data.Semigroup #endif +import Text.Emoji (baseEmojis) -- | Class abstracting over various string types that -- can fold over characters. Minimal definition is 'foldrChar' @@ -676,58 +683,212 @@ -- | Returns width of a character in a monospace font: 0 for a combining -- character, 1 for a regular character, 2 for an East Asian wide character. charWidth :: Char -> Int -charWidth c = - case c of - _ | c < '\x0300' -> 1 - | c >= '\x0300' && c <= '\x036F' -> 0 -- combining - | c >= '\x0370' && c <= '\x10FC' -> 1 - | c >= '\x1100' && c <= '\x115F' -> 2 - | c >= '\x1160' && c <= '\x11A2' -> 1 - | c >= '\x11A3' && c <= '\x11A7' -> 2 - | c >= '\x11A8' && c <= '\x11F9' -> 1 - | c >= '\x11FA' && c <= '\x11FF' -> 2 - | c >= '\x1200' && c <= '\x2328' -> 1 - | c >= '\x2329' && c <= '\x232A' -> 2 - | c >= '\x232B' && c <= '\x2E31' -> 1 - | c >= '\x2E80' && c <= '\x303E' -> 2 - | c == '\x303F' -> 1 - | c >= '\x3041' && c <= '\x3247' -> 2 - | c >= '\x3248' && c <= '\x324F' -> 1 -- ambiguous - | c >= '\x3250' && c <= '\x4DBF' -> 2 - | c >= '\x4DC0' && c <= '\x4DFF' -> 1 - | c >= '\x4E00' && c <= '\xA4C6' -> 2 - | c >= '\xA4D0' && c <= '\xA95F' -> 1 - | c >= '\xA960' && c <= '\xA97C' -> 2 - | c >= '\xA980' && c <= '\xABF9' -> 1 - | c >= '\xAC00' && c <= '\xD7FB' -> 2 - | c >= '\xD800' && c <= '\xDFFF' -> 1 - | c >= '\xE000' && c <= '\xF8FF' -> 1 -- ambiguous - | c >= '\xF900' && c <= '\xFAFF' -> 2 - | c >= '\xFB00' 
&& c <= '\xFDFD' -> 1 - | c >= '\xFE00' && c <= '\xFE0F' -> 1 -- ambiguous - | c >= '\xFE10' && c <= '\xFE19' -> 2 - | c >= '\xFE20' && c <= '\xFE26' -> 1 - | c >= '\xFE30' && c <= '\xFE6B' -> 2 - | c >= '\xFE70' && c <= '\xFEFF' -> 1 - | c >= '\xFF01' && c <= '\xFF60' -> 2 - | c >= '\xFF61' && c <= '\x16A38' -> 1 - | c >= '\x1B000' && c <= '\x1B001' -> 2 - | c >= '\x1D000' && c <= '\x1F1FF' -> 1 - | c >= '\x1F200' && c <= '\x1F251' -> 2 - | c >= '\x1F300' && c <= '\x1F773' -> 1 - | c >= '\x20000' && c <= '\x3FFFD' -> 2 - | otherwise -> 1 +charWidth c = maybe 1 (specificWidth . snd) $ IM.lookupLE (ord c) unicodeWidthMap -- | Get real length of string, taking into account combining and double-wide -- characters. realLength :: HasChars a => a -> Int -realLength s = fromMaybe 0 $ foldlChar go Nothing s +realLength = realLengthWith updateMatchState + +-- | Get real length of string, taking into account combining and double-wide +-- characters, without taking any shortcuts. This should give the same answer +-- as 'realLength', but will be slower. It is here to test that the +-- shortcuts are implemented correctly. +realLengthNoShortcut :: HasChars a => a -> Int +realLengthNoShortcut = realLengthWith updateMatchStateNoShortcut + +-- | Get real length of string, taking into account combining and double-wide +-- characters, using the given accumulator. +realLengthWith :: HasChars a => (MatchState -> Char -> MatchState) -> a -> Int +realLengthWith f = extractLength . foldlChar f (MatchState True 0 0 mempty) + where + extractLength (MatchState _ tot w _) = tot + w + +-- | Update a 'MatchState' by processing a character. +updateMatchState :: MatchState -> Char -> MatchState +updateMatchState (MatchState first tot _ Nothing) !c + -- For efficiency, we isolate commonly used portions of the basic + -- multilingual plane that do not have emoji in them. 
+ -- Maximum contiguous range containing ASCII alphabetic characters and no emoji + | c <= '\x00A8' = MatchState False (tot + 1) 0 Nothing + -- Combining characters have width 0 + | c >= '\x0300' && c <= '\x036F' = MatchState False (if first then tot + 1 else tot) 0 Nothing + -- A block of width 1 + | c >= '\x0370' && c <= '\x10FC' = MatchState False (tot + 1) 0 Nothing + -- Hexagrams are width 1 + | c >= '\x4DC0' && c <= '\x4DFF' = MatchState False (tot + 1) 0 Nothing + -- Maximum contiguous range of width 2 with no emoji containing CJK + | c >= '\x329a' && c <= '\xA4C6' = MatchState False (tot + 2) 0 Nothing + -- An ambiguous block; TODO: should be width 2 if surrounded by wide, 1 otherwise + | c >= '\x3248' && c <= '\x324F' = MatchState False (tot + 1) 0 Nothing + -- A width 1 straggler + | c == '\x303F' = MatchState False (tot + 1) 0 Nothing +updateMatchState s c = updateMatchStateNoShortcut s c + +-- | Update a 'MatchState' by processing a character, without taking any +-- shortcuts. This should give the same answer as 'updateMatchState', but will +-- be slower. It is here to test that the shortcuts are implemented correctly. +updateMatchStateNoShortcut :: MatchState -> Char -> MatchState +updateMatchStateNoShortcut (MatchState first tot _ Nothing) !c = + case IM.lookupLE oc unicodeWidthMap of + -- If there is a specific match, record the tentative width, the map of + -- continuations, and move to the next character + Just (!oc', SpecificMatch r w m) | oc == oc' -> MatchState False tot (fromMaybe r w) (Just m) + -- If there is only a range match, record the total width and move to + -- the next character + Just (!_, !match) -> let r = rangeWidth match + -- If the string starts with a combining character. 
Since there is no + -- preceding character, we count 0 width as 1 in this one case: + r' = if first && r == 0 then 1 else r + in MatchState False (tot + r') 0 Nothing + -- IM.lookupLE should not fail + Nothing -> MatchState False (tot + 1) 0 Nothing + where + oc = ord c +updateMatchStateNoShortcut (MatchState _ tot w (Just !m)) !c + -- Skin tone modifiers and variation modifiers modify the emoji up to this + -- point, so can be discarded. However, they always make it width 2, so we + -- set the tentative width to 2. + | isEmojiModifier c || isEmojiVariation c = MatchState False tot 2 (Just m) + -- Zero width joiners will join two emoji together, so let's discard the state and parse the next emoji + | isEmojiJoiner c = MatchState False tot 2 Nothing + -- Otherwise, lookup the emoji continuations + | otherwise = case IM.lookup (ord c) m of + -- Continuations match, move to the next step with new continuations + Just (Emoji ew m') -> MatchState False tot ew (Just m') + -- No continuations match, use the tentative width and process c without continuations + -- I guess we use shortcuts here; that's probably fine. + Nothing -> updateMatchState (MatchState False (tot + w) 0 Nothing) c + +-- | Keeps track of state in length calculations, determining whether we're at +-- the first character, the width so far, the tentative width for this group, +-- and the Map for possible emoji continuations. +data MatchState = MatchState !Bool !Int !Int !(Maybe EmojiMap) + +-- | A possible match for unicode characters; either within a range block, or a +-- specific match with a block range width, possibly a specific width, and a map of +-- continuations. 
+data UnicodeWidthMatch + = RangeSeparator !Int -- This code point marks the boundary of a range + | SpecificMatch !Int !(Maybe Int) !EmojiMap -- This code point has a specific emoji with continuations + deriving (Show) + +instance Semigroup UnicodeWidthMatch where + (SpecificMatch r w1 m1) <> (SpecificMatch _ w2 m2) = SpecificMatch r w $ concatEmojiMap m1 m2 + where + w = getSum <$> (Sum <$> w1) <> (Sum <$> w2) + s <> _ = s + +-- | The width of the block in which the character lies, ignoring specific +-- matches. +rangeWidth :: UnicodeWidthMatch -> Int +rangeWidth (RangeSeparator !r) = r +rangeWidth (SpecificMatch !r !_ !_) = r + +-- | The specific width of a character. +specificWidth :: UnicodeWidthMatch -> Int +specificWidth (RangeSeparator r) = r +specificWidth (SpecificMatch r w _) = fromMaybe r w + +-- | Checks whether a character is a skin tone modifier +isEmojiModifier :: Char -> Bool +isEmojiModifier c = c >= '\x1F3FB' && c <= '\x1F3FF' + +-- | Checks whether a character is an emoji variation modifier. +isEmojiVariation :: Char -> Bool +isEmojiVariation c = c == '\xFE0F' + +-- | Checks whether a character is an emoji joiner. +isEmojiJoiner :: Char -> Bool +isEmojiJoiner c = c == '\x200D' + +-- | A map for looking up the width of Unicode text. +unicodeWidthMap :: IM.IntMap UnicodeWidthMatch +unicodeWidthMap = + foldr addEmoji unicodeRangeMap + . filter (maybe True (not . isKeypad . fst) . T.uncons) -- Keypad emoji can be handled by base rules + $ filter (not . T.any isEmojiModifier) -- Emoji modifiers are inferred from the base emoji + baseEmojis + where + isKeypad c = isDigit c || c == '*' || c == '#' + +-- | Denotes the contiguous ranges of Unicode characters which have a given +-- width: 1 for a regular character, 2 for an East Asian wide character. Emoji +-- have different widths and lie within some of these blocks. And the emoji +-- will be added later. 
+unicodeRangeMap :: IM.IntMap UnicodeWidthMatch +unicodeRangeMap = IM.fromList $ map (\(c, x) -> (ord c, x)) + [ ('\x0000', RangeSeparator 1) + , ('\x0300', RangeSeparator 0) -- combining + , ('\x0370', RangeSeparator 1) + , ('\x1100', RangeSeparator 2) + , ('\x1160', RangeSeparator 1) + , ('\x11A3', RangeSeparator 2) + , ('\x11A8', RangeSeparator 1) + , ('\x11FA', RangeSeparator 2) + , ('\x1200', RangeSeparator 1) + , ('\x1AB0', RangeSeparator 0) -- combining + , ('\x1B00', RangeSeparator 1) + , ('\x1DC0', RangeSeparator 0) -- combining + , ('\x1E00', RangeSeparator 1) + , ('\x200B', RangeSeparator 0) -- zero-width characters and directional overrides + , ('\x2010', RangeSeparator 1) + , ('\x20D0', RangeSeparator 0) -- combining + , ('\x2100', RangeSeparator 1) + , ('\x2329', RangeSeparator 2) + , ('\x232B', RangeSeparator 1) + , ('\x2E80', RangeSeparator 2) + , ('\x303F', RangeSeparator 1) + , ('\x3041', RangeSeparator 2) + , ('\x3248', RangeSeparator 1) -- ambiguous + , ('\x3250', RangeSeparator 2) + , ('\x4DC0', RangeSeparator 1) + , ('\x4E00', RangeSeparator 2) + , ('\xA4D0', RangeSeparator 1) + , ('\xA960', RangeSeparator 2) + , ('\xA980', RangeSeparator 1) + , ('\xAC00', RangeSeparator 2) + , ('\xD800', RangeSeparator 1) + , ('\xE000', RangeSeparator 1) -- ambiguous + , ('\xF900', RangeSeparator 2) + , ('\xFB00', RangeSeparator 1) + , ('\xFE00', RangeSeparator 1) -- ambiguous + , ('\xFE10', RangeSeparator 2) + , ('\xFE20', RangeSeparator 0) -- combining + , ('\xFE30', RangeSeparator 2) + , ('\xFE70', RangeSeparator 1) + , ('\xFF01', RangeSeparator 2) + , ('\xFF61', RangeSeparator 1) + , ('\x1B000', RangeSeparator 2) + , ('\x1D000', RangeSeparator 1) + , ('\x1F200', RangeSeparator 2) + , ('\x1F300', RangeSeparator 1) + , ('\x1F3FB', RangeSeparator 2) -- skin tone modifiers + , ('\x1F400', RangeSeparator 1) + , ('\x20000', RangeSeparator 2) + , ('\x3FFFD', RangeSeparator 1) + ] + +type EmojiMap = IM.IntMap Emoji +data Emoji = Emoji !Int !EmojiMap + deriving 
(Show) + +concatEmojiMap :: EmojiMap -> EmojiMap -> EmojiMap +concatEmojiMap = IM.unionWith (\(Emoji w e1) (Emoji _ e2) -> Emoji w $ concatEmojiMap e1 e2) + +emojiToMatch :: IM.IntMap UnicodeWidthMatch -> NonEmpty Char -> UnicodeWidthMatch +emojiToMatch m (x:|xs) = SpecificMatch r w . emojiToMap $ filter (not . isEmojiVariation) xs where - -- Using a Maybe allows us to handle the case where the string - -- starts with a combining character. Since there is no preceding - -- character, we count 0 width as 1 in this one case: - go Nothing !c = - case charWidth c of - 0 -> Just 1 - !n -> Just n - go (Just !tot) !c = Just (tot + charWidth c) + r = maybe 1 (rangeWidth . snd) $ IM.lookupLT (ord x) m + -- If it is a single code point emoji, it is of width 2. Otherwise, don't + -- overwrite the range width. + w = if null xs then Just 2 else Nothing + +addEmoji :: Text -> IM.IntMap UnicodeWidthMatch -> IM.IntMap UnicodeWidthMatch +addEmoji !emoji !m = case T.unpack emoji of + [] -> m + x:xs -> IM.insertWith (<>) (ord x) (emojiToMatch m (x:|xs)) m + +emojiToMap :: String -> EmojiMap +emojiToMap [] = mempty +emojiToMap (x:xs) = IM.singleton (ord x) . 
Emoji 2 $ emojiToMap xs diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/doclayout-0.3.0.2/test/test.hs new/doclayout-0.3.1.1/test/test.hs --- old/doclayout-0.3.0.2/test/test.hs 2019-10-31 03:08:30.000000000 +0100 +++ new/doclayout-0.3.1.1/test/test.hs 2021-10-10 22:42:49.000000000 +0200 @@ -3,9 +3,13 @@ {-# LANGUAGE ScopedTypeVariables #-} import Text.DocLayout +import Text.Emoji import Test.Tasty import Test.Tasty.HUnit +import Test.Tasty.QuickCheck +import Data.Functor ((<&>)) import Data.Text (Text) +import qualified Data.Text as T #if MIN_VERSION_base(4,11,0) #else import Data.Semigroup @@ -268,4 +272,40 @@ Nothing (text "\870" <> space <> text "a") "\870 a" + + , testCase "length of normal text" $ + realLength ("This is going to be too long anyway" :: String) @?= 35 + + , testCase "length of normal character, which could be continued to an emoji, but isn't" $ + realLength ("*a" :: String) @?= 2 + + , testCase "length of normal character, which could be continued to an emoji, and is" $ + realLength ("*\xFE0F\x20E3\&a" :: String) @?= 3 + + , testCase "length emoji consisting of one code point" $ + realLength ("\x231A" :: String) @?= 2 + + , testCase "length of an emoji constructed using the variating modifier" $ + realLength ("\x00A9\xFE0F" :: String) @?= 2 + + , testCase "length of a non-emoji which would be an emoji with a variation modifier" $ + realLength ("\x00A9" :: String) @?= 1 + + , testCase "length of two emoji in a row" $ + realLength ("\x1F170\xFE0F\x1F1E6\x1F1E8" :: String) @?= 4 + + , testCase "length of an emoji with skin tone modifier, where stripping results in a non-emoji" $ + realLength ("\x1F590\x1F3FF" :: String) @?= 2 + + , testCase "a digit with a skin tone modifier is invalid but might appear, and shouldn't be mistaken for a variation modifier" $ + realLength ("1\x1F3FF" :: String) @?= 3 + + , testGroup "all base emoji have width 2" $ + baseEmojis <&> \emoji -> testCase (T.unpack emoji) $ 
realLength emoji @?= 2 + + , testGroup "all zero-width joiner emoji sequences have width 2" $ + zwjEmojis <&> \emoji -> testCase (T.unpack emoji) $ realLength emoji @?= 2 + + , testProperty "shortcut provides same answer for string length" . withMaxSuccess 1000000 $ + \(x :: String) -> realLength x === realLengthNoShortcut x ]
