Hello community, here is the log from the commit of package ghc-text-metrics for openSUSE:Factory checked in at 2017-08-31 21:00:28 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/ghc-text-metrics (Old) and /work/SRC/openSUSE:Factory/.ghc-text-metrics.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "ghc-text-metrics" Thu Aug 31 21:00:28 2017 rev:3 rq:513514 version:0.3.0 Changes: -------- --- /work/SRC/openSUSE:Factory/ghc-text-metrics/ghc-text-metrics.changes 2017-06-22 10:39:20.558507940 +0200 +++ /work/SRC/openSUSE:Factory/.ghc-text-metrics.new/ghc-text-metrics.changes 2017-08-31 21:00:30.341340143 +0200 @@ -1,0 +2,5 @@ +Thu Jul 27 14:06:28 UTC 2017 - [email protected] + +- Update to version 0.3.0. + +------------------------------------------------------------------- Old: ---- text-metrics-0.2.0.tar.gz text-metrics.cabal New: ---- text-metrics-0.3.0.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ ghc-text-metrics.spec ++++++ --- /var/tmp/diff_new_pack.K6p7GD/_old 2017-08-31 21:00:32.157085026 +0200 +++ /var/tmp/diff_new_pack.K6p7GD/_new 2017-08-31 21:00:32.185081092 +0200 @@ -19,17 +19,18 @@ %global pkg_name text-metrics %bcond_with tests Name: ghc-%{pkg_name} -Version: 0.2.0 +Version: 0.3.0 Release: 0 Summary: Calculate various string metrics efficiently License: BSD-3-Clause Group: Development/Languages/Other Url: https://hackage.haskell.org/package/%{pkg_name} Source0: https://hackage.haskell.org/package/%{pkg_name}-%{version}/%{pkg_name}-%{version}.tar.gz -Source1: https://hackage.haskell.org/package/%{pkg_name}-%{version}/revision/1.cabal#/%{pkg_name}.cabal BuildRequires: ghc-Cabal-devel +BuildRequires: ghc-containers-devel BuildRequires: ghc-rpm-macros BuildRequires: ghc-text-devel +BuildRequires: ghc-vector-devel BuildRoot: %{_tmppath}/%{name}-%{version}-build %if %{with tests} BuildRequires: ghc-QuickCheck-devel @@ -52,7 +53,6 @@ %prep %setup -q -n %{pkg_name}-%{version} -cp -p %{SOURCE1} %{pkg_name}.cabal %build %ghc_lib_build ++++++ text-metrics-0.2.0.tar.gz -> text-metrics-0.3.0.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/CHANGELOG.md new/text-metrics-0.3.0/CHANGELOG.md --- old/text-metrics-0.2.0/CHANGELOG.md 2016-10-09 16:43:02.000000000 +0200 +++ new/text-metrics-0.3.0/CHANGELOG.md 2017-06-13 10:43:59.000000000 +0200 @@ -1,3 +1,13 @@ +## Text Metrics 0.3.0 + +* All functions are now implemented in pure Haskell. + +* All functions return `Int` or `Ratio Int` instead of `Natural` and `Ratio + Natural`. + +* Added `overlap` (returns overlap coefficient) and `jaccard` (returns + Jaccard similarity coefficient). + ## Text Metrics 0.2.0 * Made the `levenshtein`, `levenshteinNorm`, `damerauLevenshtein`, and diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/Data/Text/Metrics.hs new/text-metrics-0.3.0/Data/Text/Metrics.hs --- old/text-metrics-0.2.0/Data/Text/Metrics.hs 2016-10-09 16:43:02.000000000 +0200 +++ new/text-metrics-0.3.0/Data/Text/Metrics.hs 2017-06-13 10:42:49.000000000 +0200 @@ -1,34 +1,24 @@ -- | -- Module : Data.Text.Metrics --- Copyright : © 2016 Mark Karpov +-- Copyright : © 2016–2017 Mark Karpov -- License : BSD 3 clause -- --- Maintainer : Mark Karpov <[email protected]> +-- Maintainer : Mark Karpov <[email protected]> -- Stability : experimental -- Portability : portable -- --- The module provides efficient implementations of various strings metrics. --- It works with strict 'Text' values and returns either 'Natural' numbers --- (because the metrics cannot be negative), or @'Ratio' 'Natural'@ values --- because returned values are rational non-negative numbers by definition. --- --- The functions provided here are the fastest implementations available for --- use in Haskell programs. In fact the functions are implemented in C for --- maximal efficiency, but this leads to a minor flaw. When we work with --- 'Text' values in C, they are represented as UTF-16 encoded strings of --- two-byte values. The algorithms treat the strings as if a character --- corresponds to one element in such strings, which is true for almost all --- modern text data. However, there are characters that are represented by --- two adjoined elements in UTF-16: emoji, historic scripts, less used --- Chinese ideographs, and some more. If input 'Text' of the functions --- contains such characters, the functions may return slightly incorrect --- result. Decide for yourself if this is acceptable for your use case, but --- chances are you will never run into situations when the functions produce --- incorrect results. - -{-# LANGUAGE CPP #-} -{-# LANGUAGE ForeignFunctionInterface #-} -{-# LANGUAGE OverloadedStrings #-} +-- The module provides efficient implementations of various strings metric +-- algorithms. It works with strict 'Text' values. +-- +-- __Note__: before version /0.3.0/ the package used C implementations of +-- the algorithms under the hood. Beginning from version /0.3.0/, the +-- implementations are written in Haskell while staying almost as fast, see: +-- +-- <https://markkarpov.com/post/migrating-text-metrics.html> + +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE MultiWayIf #-} module Data.Text.Metrics ( -- * Levenshtein variants @@ -36,20 +26,25 @@ , levenshteinNorm , damerauLevenshtein , damerauLevenshteinNorm + -- * Treating inputs like sets + , overlap + , jaccard -- * Other , hamming , jaro , jaroWinkler ) where +import Control.Monad +import Control.Monad.ST +import Data.Map.Strict (Map) import Data.Ratio import Data.Text -import Foreign -import Foreign.C.Types -import Numeric.Natural -import System.IO.Unsafe -import qualified Data.Text as T -import qualified Data.Text.Foreign as TF +import GHC.Exts (inline) +import qualified Data.Map.Strict as M +import qualified Data.Text as T +import qualified Data.Text.Unsafe as TU +import qualified Data.Vector.Unboxed.Mutable as VUM #if !MIN_VERSION_base(4,8,0) import Control.Applicative @@ -59,82 +54,233 @@ -- Levenshtein variants -- | Return Levenshtein distance between two 'Text' values. Classic --- Levenshtein distance between two strings is minimal number of operations --- necessary to transform one string into another. For Levenshtein distance --- allowed operations are: deletion, insertion, and substitution. +-- Levenshtein distance between two strings is the minimal number of +-- operations necessary to transform one string into another. For +-- Levenshtein distance allowed operations are: deletion, insertion, and +-- substitution. -- -- See also: <https://en.wikipedia.org/wiki/Levenshtein_distance>. +-- +-- __Heads up__, before version /0.3.0/ this function returned +-- 'Data.Numeric.Natural'. -levenshtein :: Text -> Text -> Natural -levenshtein = withTwo c_levenshtein - -foreign import ccall unsafe "tmetrics_levenshtein" - c_levenshtein :: CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt +levenshtein :: Text -> Text -> Int +levenshtein a b = fst (levenshtein_ a b) -- | Return normalized Levenshtein distance between two 'Text' values. -- Result is a non-negative rational number (represented as @'Ratio' --- 'Natural'@), where 0 signifies no similarity between the strings, while 1 --- means exact match. The operation is virtually as fast as 'levenshtein'. +-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the +-- strings, while 1 means exact match. -- -- See also: <https://en.wikipedia.org/wiki/Levenshtein_distance>. +-- +-- __Heads up__, before version /0.3.0/ this function returned @'Ratio' +-- 'Data.Numeric.Natural'@. + +levenshteinNorm :: Text -> Text -> Ratio Int +levenshteinNorm = norm levenshtein_ -levenshteinNorm :: Text -> Text -> Ratio Natural -levenshteinNorm = norm levenshtein -{-# INLINE levenshteinNorm #-} +-- | An internal helper, returns Levenshtein distance as the first element +-- of the tuple and max length of the two inputs as the second element of +-- the tuple. + +levenshtein_ :: Text -> Text -> (Int, Int) +levenshtein_ a b + | T.null a = (lenb, lenm) + | T.null b = (lena, lenm) + | otherwise = runST $ do + let v_len = lenb + 1 + v <- VUM.unsafeNew (v_len * 2) + let gov !i = + when (i < v_len) $ do + VUM.unsafeWrite v i i + gov (i + 1) + goi !i !na !v0 !v1 = do + let !(TU.Iter ai da) = TU.iter a na + goj !j !nb = + when (j < lenb) $ do + let !(TU.Iter bj db) = TU.iter b nb + cost = if ai == bj then 0 else 1 + x <- (+ 1) <$> VUM.unsafeRead v (v1 + j) + y <- (+ 1) <$> VUM.unsafeRead v (v0 + j + 1) + z <- (+ cost) <$> VUM.unsafeRead v (v0 + j) + VUM.unsafeWrite v (v1 + j + 1) (min x (min y z)) + goj (j + 1) (nb + db) + when (i < lena) $ do + VUM.unsafeWrite v v1 (i + 1) + goj 0 0 + goi (i + 1) (na + da) v1 v0 + gov 0 + goi 0 0 0 v_len + ld <- VUM.unsafeRead v (lenb + if even lena then 0 else v_len) + return (ld, lenm) + where + lena = T.length a + lenb = T.length b + lenm = max lena lenb +{-# INLINE levenshtein_ #-} -- | Return Damerau-Levenshtein distance between two 'Text' values. The -- function works like 'levenshtein', but the collection of allowed --- operations also includes transposition of two /adjacent/ characters. The --- function is about 20% slower than 'levenshtein', but still pretty fast. +-- operations also includes transposition of two /adjacent/ characters. -- -- See also: <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>. +-- +-- __Heads up__, before version /0.3.0/ this function returned +-- 'Data.Numeric.Natural'. -damerauLevenshtein :: Text -> Text -> Natural -damerauLevenshtein = withTwo c_damerau_levenshtein - -foreign import ccall unsafe "tmetrics_damerau_levenshtein" - c_damerau_levenshtein :: CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt +damerauLevenshtein :: Text -> Text -> Int +damerauLevenshtein a b = fst (damerauLevenshtein_ a b) -- | Return normalized Damerau-Levenshtein distance between two 'Text' --- values. Result is a non-negative rational number (represented as @'Ratio' --- 'Natural'@), where 0 signifies no similarity between the strings, while 1 --- means exact match. The operation is virtually as fast as --- 'damerauLevenshtein'. +-- values. 0 signifies no similarity between the strings, while 1 means +-- exact match. -- -- See also: <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>. +-- +-- __Heads up__, before version /0.3.0/ this function returned @'Ratio' +-- 'Data.Numeric.Natural'@. + +damerauLevenshteinNorm :: Text -> Text -> Ratio Int +damerauLevenshteinNorm = norm damerauLevenshtein_ + +-- | An internal helper, returns Damerau-Levenshtein distance as the first +-- element of the tuple and max length of the two inputs as the second +-- element of the tuple. + +damerauLevenshtein_ :: Text -> Text -> (Int, Int) +damerauLevenshtein_ a b + | T.null a = (lenb, lenm) + | T.null b = (lena, lenm) + | otherwise = runST $ do + let v_len = lenb + 1 + v <- VUM.unsafeNew (v_len * 3) + let gov !i = + when (i < v_len) $ do + VUM.unsafeWrite v i i + gov (i + 1) + goi !i !na !ai_1 !v0 !v1 !v2 = do + let !(TU.Iter ai da) = TU.iter a na + goj !j !nb !bj_1 = + when (j < lenb) $ do + let !(TU.Iter bj db) = TU.iter b nb + cost = if ai == bj then 0 else 1 + x <- (+ 1) <$> VUM.unsafeRead v (v1 + j) + y <- (+ 1) <$> VUM.unsafeRead v (v0 + j + 1) + z <- (+ cost) <$> VUM.unsafeRead v (v0 + j) + let g = min x (min y z) + val <- (+ cost) <$> VUM.unsafeRead v (v2 + j - 1) + VUM.unsafeWrite v (v1 + j + 1) $ + if i > 0 && j > 0 && ai == bj_1 && ai_1 == bj && val < g + then val + else g + goj (j + 1) (nb + db) bj + when (i < lena) $ do + VUM.unsafeWrite v v1 (i + 1) + goj 0 0 'a' + goi (i + 1) (na + da) ai v1 v2 v0 + gov 0 + goi 0 0 'a' 0 v_len (v_len * 2) + ld <- VUM.unsafeRead v (lenb + (lena `mod` 3) * v_len) + return (ld, lenm) + where + lena = T.length a + lenb = T.length b + lenm = max lena lenb +{-# INLINE damerauLevenshtein_ #-} + +---------------------------------------------------------------------------- +-- Treating inputs like sets + +-- | Return overlap coefficient for two 'Text' values. Returned value is in +-- the range from 0 (no similarity) to 1 (exact match). Return 1 if both +-- 'Text' values are empty. +-- +-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>. +-- +-- @since 0.3.0 + +overlap :: Text -> Text -> Ratio Int +overlap a b = + if d == 0 + then 1 % 1 + else intersectionSize (mkTextMap a) (mkTextMap b) % d + where + d = min (T.length a) (T.length b) + +-- | Return Jaccard similarity coefficient for two 'Text' values. Returned +-- value is in the range from 0 (no similarity) to 1 (exact match). Return 1 +-- if both +-- +-- See also: <https://en.wikipedia.org/wiki/Jaccard_index> +-- +-- @since 0.3.0 + +jaccard :: Text -> Text -> Ratio Int +jaccard a b = + if d == 0 + then 1 % 1 + else intersectionSize ma mb % d + where + ma = mkTextMap a + mb = mkTextMap b + d = unionSize ma mb + +-- | Make a map from 'Char' to 'Int' representing how many times the 'Char' +-- appears in the input 'Text'. + +mkTextMap :: Text -> Map Char Int +mkTextMap = T.foldl' f M.empty + where + f m ch = M.insertWith (+) ch 1 m +{-# INLINE mkTextMap #-} + +-- | Return intersection size between two 'Text'-maps. + +intersectionSize :: Map Char Int -> Map Char Int -> Int +intersectionSize a b = M.foldl' (+) 0 (M.intersectionWith min a b) +{-# INLINE intersectionSize #-} + +-- | Return union size between two 'Text'-maps. -damerauLevenshteinNorm :: Text -> Text -> Ratio Natural -damerauLevenshteinNorm = norm damerauLevenshtein -{-# INLINE damerauLevenshteinNorm #-} +unionSize :: Map Char Int -> Map Char Int -> Int +unionSize a b = M.foldl' (+) 0 (M.unionWith max a b) +{-# INLINE unionSize #-} ---------------------------------------------------------------------------- -- Other -- | /O(n)/ Return Hamming distance between two 'Text' values. Hamming --- distance is defined as number of positions at which the corresponding +-- distance is defined as the number of positions at which the corresponding -- symbols are different. The input 'Text' values should be of equal length -- or 'Nothing' will be returned. -- -- See also: <https://en.wikipedia.org/wiki/Hamming_distance>. +-- +-- __Heads up__, before version /0.3.0/ this function returned @'Maybe' +-- 'Data.Numeric.Natural'@. -hamming :: Text -> Text -> Maybe Natural +hamming :: Text -> Text -> Maybe Int hamming a b = if T.length a == T.length b - then Just . unsafePerformIO . TF.useAsPtr a $ \aptr size -> - TF.useAsPtr b $ \bptr _ -> - fromIntegral <$> c_hamming (fromIntegral size) aptr bptr + then Just (go 0 0 0) else Nothing - -foreign import ccall unsafe "tmetrics_hamming" - c_hamming :: CUInt -> Ptr Word16 -> Ptr Word16 -> IO CUInt + where + go !na !nb !r = + let !(TU.Iter cha da) = TU.iter a na + !(TU.Iter chb db) = TU.iter b nb + in if | na == len -> r + | cha /= chb -> go (na + da) (nb + db) (r + 1) + | otherwise -> go (na + da) (nb + db) r + len = TU.lengthWord16 a -- | Return Jaro distance between two 'Text' values. Returned value is in --- range from 0 (no similarity) to 1 (exact match). +-- the range from 0 (no similarity) to 1 (exact match). -- -- While the algorithm is pretty clear for artificial examples (like those -- from the linked Wikipedia article), for /arbitrary/ strings, it may be -- hard to decide which of two strings should be considered as one having --- “reference” order of characters (since order of matching characters in an +-- “reference” order of characters (order of matching characters in an -- essential part of the definition of the algorithm). This makes us -- consider the first string the “reference” string (with correct order of -- characters). Thus generally, @@ -144,71 +290,98 @@ -- This asymmetry can be found in all implementations of the algorithm on -- the internet, AFAIK. -- --- See also: <http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance> +-- See also: <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance> -- -- @since 0.2.0 +-- +-- __Heads up__, before version /0.3.0/ this function returned @'Ratio' +-- 'Data.Numeric.Natural'@. -jaro :: Text -> Text -> Ratio Natural -jaro = jaroCommon (\_ _ _ _ x -> return x) - -jaroCommon :: (CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> Ratio Natural -> IO (Ratio Natural)) -> Text -> Text -> Ratio Natural -jaroCommon f a b = unsafePerformIO $ alloca $ \m' -> alloca $ \t' -> - TF.useAsPtr a $ \aptr asize -> - TF.useAsPtr b $ \bptr bsize -> - if asize == 0 || bsize == 0 - then return (1 % 1) - else do - let asize' = fromIntegral asize - bsize' = fromIntegral bsize - c_jaro m' t' asize' aptr bsize' bptr - m <- fromIntegral <$> peek m' - t <- fromIntegral <$> peek t' - f asize' aptr bsize' bptr $ - if m == 0 - then 0 - else ((m % fromIntegral asize) + - (m % fromIntegral bsize) + - ((m - t) % m)) / 3 -{-# INLINE jaroCommon #-} - -foreign import ccall unsafe "tmetrics_jaro" - c_jaro :: Ptr CUInt -> Ptr CUInt -> CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO () +jaro :: Text -> Text -> Ratio Int +jaro a b = + if T.null a || T.null b + then 0 % 1 + else runST $ do + let lena = T.length a + lenb = T.length b + d = + if lena >= 2 && lenb >= 2 + then max lena lenb `quot` 2 - 1 + else 0 + v <- VUM.replicate lenb (0 :: Int) + r <- VUM.replicate 3 (0 :: Int) -- tj, m, t + let goi !i !na !fromb = do + let !(TU.Iter ai da) = TU.iter a na + (from, fromb') = + if i >= d + then (i - d, fromb + TU.iter_ b fromb) + else (0, 0) + to = min (i + d + 1) lenb + goj !j !nb = + when (j < to) $ do + let !(TU.Iter bj db) = TU.iter b nb + used <- (== 1) <$> VUM.unsafeRead v j + if not used && ai == bj + then do + tj <- VUM.unsafeRead r 0 + if j < tj + then VUM.unsafeModify r (+ 1) 2 + else VUM.unsafeWrite r 0 j + VUM.unsafeWrite v j 1 + VUM.unsafeModify r (+ 1) 1 + else goj (j + 1) (nb + db) + when (i < lena) $ do + goj from fromb + goi (i + 1) (na + da) fromb' + goi 0 0 0 + m <- VUM.unsafeRead r 1 + t <- VUM.unsafeRead r 2 + return $ + if m == 0 + then 0 % 1 + else ((m % lena) + + (m % lenb) + + ((m - t) % m)) / 3 -- | Return Jaro-Winkler distance between two 'Text' values. Returned value -- is in range from 0 (no similarity) to 1 (exact match). -- --- See also: <http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance> +-- See also: <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance> -- -- @since 0.2.0 +-- +-- __Heads up__, before version /0.3.0/ this function returned @'Ratio' +-- 'Data.Numeric.Natural'@. -jaroWinkler :: Text -> Text -> Ratio Natural -jaroWinkler = jaroCommon g +jaroWinkler :: Text -> Text -> Ratio Int +jaroWinkler a b = dj + (1 % 10) * l * (1 - dj) where - g asize aptr bsize bptr dj = do - l <- fromIntegral <$> c_common_prefix asize aptr bsize bptr - return (dj + (1 % 10) * l * (1 - dj)) + dj = inline (jaro a b) + l = fromIntegral (commonPrefix a b) -foreign import ccall unsafe "tmetrics_common_prefix" - c_common_prefix :: CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt +-- | Return length of common prefix two 'Text' values have. + +commonPrefix :: Text -> Text -> Int +commonPrefix a b = go 0 0 0 + where + go !na !nb !r = + let !(TU.Iter cha da) = TU.iter a na + !(TU.Iter chb db) = TU.iter b nb + in if | na == lena -> r + | nb == lenb -> r + | cha == chb -> go (na + da) (nb + db) (r + 1) + | otherwise -> r + lena = TU.lengthWord16 a + lenb = TU.lengthWord16 b +{-# INLINE commonPrefix #-} ---------------------------------------------------------------------------- -- Helpers -withTwo - :: (CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt) - -> Text - -> Text - -> Natural -withTwo f a b = - unsafePerformIO . TF.useAsPtr a $ \aptr asize -> - TF.useAsPtr b $ \bptr bsize -> - fromIntegral <$> f (fromIntegral asize) aptr (fromIntegral bsize) bptr -{-# INLINE withTwo #-} - -norm :: (Text -> Text -> Natural) -> Text -> Text -> Ratio Natural +norm :: (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int norm f a b = - let r = f a b + let (r, l) = f a b in if r == 0 then 1 % 1 - else 1 % 1 - r % fromIntegral (max (T.length a) (T.length b)) + else 1 % 1 - r % l {-# INLINE norm #-} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/LICENSE.md new/text-metrics-0.3.0/LICENSE.md --- old/text-metrics-0.2.0/LICENSE.md 2016-01-03 14:37:56.000000000 +0100 +++ new/text-metrics-0.3.0/LICENSE.md 2017-06-02 17:36:51.000000000 +0200 @@ -1,4 +1,4 @@ -Copyright © 2016 Mark Karpov +Copyright © 2016–2017 Mark Karpov All rights reserved. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/README.md new/text-metrics-0.3.0/README.md --- old/text-metrics-0.2.0/README.md 2016-10-09 16:43:02.000000000 +0200 +++ new/text-metrics-0.3.0/README.md 2017-06-13 10:42:49.000000000 +0200 @@ -7,76 +7,44 @@ [](https://travis-ci.org/mrkkrp/text-metrics) [](https://coveralls.io/github/mrkkrp/text-metrics?branch=master) -The library provides efficient implementations of various strings metrics. -It works with strict `Text` values and returns either `Natural` numbers -(because the metrics cannot be negative), or `Ratio Natural` values because -returned values are rational non-negative numbers by definition. - -The functions provided here are the fastest implementations available for -use in Haskell programs. In fact the functions are implemented in C for -maximal efficiency, but this leads to a minor flaw. When we work with `Text` -values in C, they are represented as UTF-16 encoded strings of two-byte -values. The algorithms treat the strings as if a character corresponds to -one element in such strings, which is true for almost all modern text data. -However, there are characters that are represented by two adjoined elements -in UTF-16: emoji, historic scripts, less used Chinese ideographs, and some -more. If input `Text` of the functions contains such characters, the -functions may return slightly incorrect result. Decide for yourself if this -is acceptable for your use case, but chances are you will never run into -situations when the functions produce incorrect results. +The library provides efficient implementations of various strings metric +algorithms. It works with strict `Text` values. The current version of the package implements: -* [Levenshtein distance](http://en.wikipedia.org/wiki/Levenshtein_distance) -* [Normalized Levenshtein distance](http://en.wikipedia.org/wiki/Levenshtein_distance) -* [Damerau-Levenshtein distance](http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) -* [Normalized Damerau-Levenshtein distance](http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) -* [Hamming distance](http://en.wikipedia.org/wiki/Hamming_distance) -* [Jaro distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) -* [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) - -TODO list: - -* [Overlap coefficient](http://en.wikipedia.org/wiki/Overlap_coefficient) -* [Jaccard similarity coefficient](http://en.wikipedia.org/wiki/Jaccard_index) +* [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) +* [Normalized Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) +* [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) +* [Normalized Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) +* [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) +* [Jaro distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) +* [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) +* [Overlap coefficient](https://en.wikipedia.org/wiki/Overlap_coefficient) +* [Jaccard similarity coefficient](https://en.wikipedia.org/wiki/Jaccard_index) ## Comparison with the `edit-distance` package -There -is [`edit-distance`](https://hackage.haskell.org/package/edit-distance) -package whose scope overlaps with the scope of this package. The differences -are: +There is [`edit-distance`](https://hackage.haskell.org/package/edit-distance) package whose scope overlaps with the scope of +this package. The differences are: * `edit-distance` allows to specify costs for every operation when calculating Levenshtein distance (insertion, deletion, substitution, and transposition). This is rarely needed though in real-world applications, IMO. -* `edit-distance` only provides single Levenshtein distance, `text-metrics` - aims to provide implementations of most string metrics algorithms. +* `edit-distance` only provides Levenshtein distance, `text-metrics` aims to + provide implementations of most string metrics algorithms. * `edit-distance` works on `Strings`, while `text-metrics` works on strict `Text` values. -* As `README.md` of `edit-distance` states, “[the algorithms] have been - fairly heavily optimized”, which is apparently true, yet the - `text-metrics` is faster for short strings (under 64 characters) and even - faster for longer strings (scales better). How much faster? For short - strings more than ×3, and about ×4 for longer strings. - ## Implementation -All “meat” of the algorithms is written in C in a rather straightforward -way. Levenshtein variants are based on the “iterative algorithm with two -matrix rows” from Wikipedia with additional improvement that we do not copy -current row of distances into previous row, but just swap the pointers -(which is OK, since the arrays have equal length and current row will be -overwritten in the next iteration anyway). - -Normalized versions are defined as thin (inlined) Haskell wrappers. +Although we originally used C for speed, currently all functions are pure +Haskell tuned for performance. See [this blog post](https://markkarpov.com/post/migrating-text-metrics.html) for more info. ## License -Copyright © 2016 Mark Karpov +Copyright © 2016–2017 Mark Karpov Distributed under BSD 3 clause license. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/bench/Main.hs new/text-metrics-0.3.0/bench/Main.hs --- old/text-metrics-0.2.0/bench/Main.hs 2016-10-09 16:43:02.000000000 +0200 +++ new/text-metrics-0.3.0/bench/Main.hs 1970-01-01 01:00:00.000000000 +0100 @@ -1,64 +0,0 @@ --- --- Benchmarks for the ‘text-metrics’ package. --- --- Copyright © 2016 Mark Karpov <[email protected]> --- --- Redistribution and use in source and binary forms, with or without --- modification, are permitted provided that the following conditions are --- met: --- --- * Redistributions of source code must retain the above copyright notice, --- this list of conditions and the following disclaimer. --- --- * Redistributions in binary form must reproduce the above copyright --- notice, this list of conditions and the following disclaimer in the --- documentation and/or other materials provided with the distribution. --- --- * Neither the name Mark Karpov nor the names of contributors may be used --- to endorse or promote products derived from this software without --- specific prior written permission. --- --- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY --- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED --- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE --- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY --- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL --- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS --- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) --- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, --- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN --- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE --- POSSIBILITY OF SUCH DAMAGE. - -module Main (main) where - -import Control.DeepSeq -import Criterion.Main -import Data.Text (Text) -import Data.Text.Metrics -import qualified Data.Text as T - -main :: IO () -main = defaultMain - [ btmetric "levenshtein" levenshtein - , btmetric "levenshteinNorm" levenshteinNorm - , btmetric "damerauLevenshtein" damerauLevenshtein - , btmetric "damerauLevenshteinNorm" damerauLevenshteinNorm - , btmetric "hamming" hamming - , btmetric "jaro" jaro - , btmetric "jaroWinkler" jaroWinkler ] - --- | Produce benchmark group to test - -btmetric :: NFData a => String -> (Text -> Text -> a) -> Benchmark -btmetric name f = bgroup name (bs <$> stdSeries) - where - bs n = env (return (testData n, testData n)) (bench (show n) . nf (uncurry f)) - --- | The series of lengths to try with every function as part of 'btmetric'. - -stdSeries :: [Int] -stdSeries = [5,10,20,40,80,160] - -testData :: Int -> Text -testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a'..'z'] diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/bench-memory/Main.hs new/text-metrics-0.3.0/bench-memory/Main.hs --- old/text-metrics-0.2.0/bench-memory/Main.hs 1970-01-01 01:00:00.000000000 +0100 +++ new/text-metrics-0.3.0/bench-memory/Main.hs 2017-06-13 10:42:49.000000000 +0200 @@ -0,0 +1,38 @@ +module Main (main) where + +import Control.DeepSeq +import Control.Monad +import Data.Text (Text) +import Data.Text.Metrics +import Weigh +import qualified Data.Text as T + +main :: IO () +main = mainWith $ do + setColumns [Case, Allocated, GCs, Max] + bmetric "levenshtein" levenshtein + bmetric "levenshteinNorm" levenshteinNorm + bmetric "damerauLevenshtein" damerauLevenshtein + bmetric "damerauLevenshteinNorm" damerauLevenshteinNorm + bmetric "overlap" overlap + bmetric "jaccard" jaccard + bmetric "hamming" hamming + bmetric "jaro" jaro + bmetric "jaroWinkler" jaroWinkler + +-- | Perform a series to measurements with the same metric function. + +bmetric :: NFData a + => String -- ^ Name of the benchmark group + -> (Text -> Text -> a) -- ^ The function to benchmark + -> Weigh () +bmetric name f = forM_ stdSeries $ \n -> + func (name ++ "/" ++ show n) (uncurry f) (testData n, testData n) + +-- | The series of lengths to try with every function as part of 'btmetric'. + +stdSeries :: [Int] +stdSeries = [5,10,20,40,80,160] + +testData :: Int -> Text +testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a'..'z'] diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/bench-speed/Main.hs new/text-metrics-0.3.0/bench-speed/Main.hs --- old/text-metrics-0.2.0/bench-speed/Main.hs 1970-01-01 01:00:00.000000000 +0100 +++ new/text-metrics-0.3.0/bench-speed/Main.hs 2017-06-13 10:42:49.000000000 +0200 @@ -0,0 +1,34 @@ +module Main (main) where + +import Control.DeepSeq +import Criterion.Main +import Data.Text (Text) +import Data.Text.Metrics +import qualified Data.Text as T + +main :: IO () +main = defaultMain + [ btmetric "levenshtein" levenshtein + , btmetric "levenshteinNorm" levenshteinNorm + , btmetric "damerauLevenshtein" damerauLevenshtein + , btmetric "damerauLevenshteinNorm" damerauLevenshteinNorm + , btmetric "overlap" overlap + , btmetric "jaccard" jaccard + , btmetric "hamming" hamming + , btmetric "jaro" jaro + , btmetric "jaroWinkler" jaroWinkler ] + +-- | Produce benchmark group to test. + +btmetric :: NFData a => String -> (Text -> Text -> a) -> Benchmark +btmetric name f = bgroup name (bs <$> stdSeries) + where + bs n = env (return (testData n, testData n)) (bench (show n) . nf (uncurry f)) + +-- | The series of lengths to try with every function as part of 'btmetric'. + +stdSeries :: [Int] +stdSeries = [5,10,20,40,80,160] + +testData :: Int -> Text +testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a'..'z'] diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/cbits/text_metrics.c new/text-metrics-0.3.0/cbits/text_metrics.c --- old/text-metrics-0.2.0/cbits/text_metrics.c 2016-10-09 16:43:02.000000000 +0200 +++ new/text-metrics-0.3.0/cbits/text_metrics.c 1970-01-01 01:00:00.000000000 +0100 @@ -1,207 +0,0 @@ -/* - * This file is part of ‘text-metrics’ package. - * - * Copyright © 2016 Mark Karpov - * - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * * Neither the name Mark Karpov nor the names of contributors may be used to - * endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS - * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN - * NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, - * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, - * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "text_metrics.h" - -/* Levenshtein variants */ - -unsigned int tmetrics_levenshtein (unsigned int la, uint16_t *a, unsigned int lb, uint16_t *b) -{ - if (la == 0) return lb; - if (lb == 0) return la; - - unsigned int v_len = lb + 1, *v0, *v1, i, j; - - if (v_len > VLEN_MAX) - { - v0 = malloc(sizeof(unsigned int) * v_len); - v1 = malloc(sizeof(unsigned int) * v_len); - } - else - { - v0 = alloca(sizeof(unsigned int) * v_len); - v1 = alloca(sizeof(unsigned int) * v_len); - } - - for (i = 0; i < v_len; i++) - v0[i] = i; - - for (i = 0; i < la; i++) - { - v1[0] = i + 1; - - for (j = 0; j < lb; j++) - { - unsigned int cost = *(a + i) == *(b + j) ? 0 : 1; - unsigned int x = *(v1 + j) + 1; - unsigned int y = *(v0 + j + 1) + 1; - unsigned int z = *(v0 + j) + cost; - *(v1 + j + 1) = MIN(x, MIN(y, z)); - } - - unsigned int *ptr = v0; - v0 = v1; - v1 = ptr; - } - - unsigned int result = *(v0 + lb); - - if (v_len > VLEN_MAX) - { - free(v0); - free(v1); - } - - return result; -} - -unsigned int tmetrics_damerau_levenshtein (unsigned int la, uint16_t *a, unsigned int lb, uint16_t *b) -{ - if (la == 0) return lb; - if (lb == 0) return la; - - unsigned int v_len = lb + 1, *v0, *v1, *v2, i, j; - - if (v_len > VLEN_MAX) - { - v0 = malloc(sizeof(unsigned int) * v_len); - v1 = malloc(sizeof(unsigned int) * v_len); - v2 = malloc(sizeof(unsigned int) * v_len); - } - else - { - v0 = alloca(sizeof(unsigned int) * v_len); - v1 = alloca(sizeof(unsigned int) * v_len); - v2 = alloca(sizeof(unsigned int) * v_len); - } - - for (i = 0; i < v_len; i++) - v0[i] = i; - - for (i = 0; i < la; i++) - { - v1[0] = i + 1; - - for (j = 0; j < lb; j++) - { - unsigned int cost = *(a + i) == *(b + j) ? 0 : 1; - unsigned int x = *(v1 + j) + 1; - unsigned int y = *(v0 + j + 1) + 1; - unsigned int z = *(v0 + j) + cost; - *(v1 + j + 1) = MIN(x, MIN(y, z)); - unsigned int val = *(v2 + j - 1) + cost; - if ( i > 0 && - j > 0 && - *(a + i) == *(b + j - 1) && - *(a + i - 1) == *(b + j) && - val < *(v1 + j + 1) ) - *(v1 + j + 1) = val; - } - - unsigned int *ptr = v0; - v0 = v1; - v1 = v2; - v2 = ptr; - } - - unsigned int result = *(v0 + lb); - - if (v_len > VLEN_MAX) - { - free(v0); - free(v1); - free(v2); - } - - return result; -} - -/* Other */ - -unsigned int tmetrics_hamming (unsigned int len, uint16_t *a, uint16_t *b) -{ - unsigned int acc = 0, i; - for (i = 0; i < len; i++) - { - if (*(a + i) != *(b + i)) acc++; - } - return acc; -} - -void tmetrics_jaro (unsigned int *m, unsigned int *t, unsigned int la, uint16_t *a, unsigned int lb, uint16_t *b) -{ - unsigned int d = 0, i, j, tj = 0, from, to; - char *v; - - *m = 0, *t = 0; - - if (la >= 2 && lb >= 2) - d = MAX(lb, la) / 2 - 1; - - if (lb > VLEN_MAX) v = malloc(sizeof(char) * lb); - else v = alloca(sizeof(char) * lb); - - for (i = 0; i < lb; i++) *(v + i) = 0; - - for (i = 0; i < la; i++) - { - from = i < d ? 0 : i - d; - to = MIN(i + d + 1, lb); - for (j = from; j < to; j++) - { - if (*(v + j)) continue; - - if (*(a + i) == *(b + j)) - { - if (j < tj) (*t)++; - else tj = j; - *(v + j) = 1; - (*m)++; - break; - } - } - } - - if (lb > VLEN_MAX) free(v); -} - -unsigned int tmetrics_common_prefix (unsigned int la, uint16_t *a, unsigned int lb, uint16_t *b) -{ - unsigned int acc = 0, i, l = MIN(la, lb); - for (i = 0; i < l; i++) - { - if (*(a + i) == *(b + i)) acc++; - else break; - } - return acc; -} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/cbits/text_metrics.h new/text-metrics-0.3.0/cbits/text_metrics.h --- old/text-metrics-0.2.0/cbits/text_metrics.h 2016-10-09 16:43:02.000000000 +0200 +++ new/text-metrics-0.3.0/cbits/text_metrics.h 1970-01-01 01:00:00.000000000 +0100 @@ -1,56 +0,0 @@ -/* - * This file is part of ‘text-metrics’ package. - * - * Copyright © 2016 Mark Karpov - * - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * * Neither the name Mark Karpov nor the names of contributors may be used to - * endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS - * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN - * NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, - * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, - * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef TEXT_METRICS_H -#define TEXT_METRICS_H - -#include <stdint.h> -#include <stdlib.h> - -#define MAX(a, b) ((a) > (b) ? (a) : (b)) -#define MIN(a, b) ((a) < (b) ? (a) : (b)) - -#define VLEN_MAX 255 /* Up to this length we use alloca. */ - -/* Levenshein variants */ - -unsigned int tmetrics_levenshtein (unsigned int, uint16_t *, unsigned int, uint16_t *); -unsigned int tmetrics_damerau_levenshtein (unsigned int, uint16_t *, unsigned int, uint16_t *); - -/* Other */ - -unsigned int tmetrics_hamming (unsigned int, uint16_t *, uint16_t *); -void tmetrics_jaro (unsigned int *, unsigned int *, unsigned int, uint16_t *, unsigned int, uint16_t *); -unsigned int tmetrics_common_prefix (unsigned int, uint16_t *, unsigned int, uint16_t *); - -#endif /* TEXT_METRICS_H */ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/tests/Main.hs new/text-metrics-0.3.0/tests/Main.hs --- old/text-metrics-0.2.0/tests/Main.hs 2016-10-09 16:43:02.000000000 +0200 +++ new/text-metrics-0.3.0/tests/Main.hs 2017-06-13 10:42:49.000000000 +0200 @@ -1,35 +1,3 @@ --- --- Tests for the ‘text-metrics’ package. --- --- Copyright © 2016 Mark Karpov <[email protected]> --- --- Redistribution and use in source and binary forms, with or without --- modification, are permitted provided that the following conditions are --- met: --- --- * Redistributions of source code must retain the above copyright notice, --- this list of conditions and the following disclaimer. --- --- * Redistributions in binary form must reproduce the above copyright --- notice, this list of conditions and the following disclaimer in the --- documentation and/or other materials provided with the distribution. --- --- * Neither the name Mark Karpov nor the names of contributors may be used --- to endorse or promote products derived from this software without --- specific prior written permission. --- --- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY --- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED --- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE --- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY --- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL --- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS --- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) --- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, --- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN --- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE --- POSSIBILITY OF SUCH DAMAGE. - {-# LANGUAGE CPP #-} {-# LANGUAGE OverloadedStrings #-} {-# OPTIONS_GHC -fno-warn-orphans #-} @@ -62,6 +30,9 @@ testPair levenshtein "cake" "drake" 2 testPair levenshtein "saturday" "sunday" 3 testPair levenshtein "red" "wax" 3 +#if __GLASGOW_HASKELL__ >= 710 + testPair levenshtein "a😀c" "abc" 1 +#endif testPair levenshtein "lucky" "lucky" 0 testPair levenshtein "" "" 0 describe "levenshteinNorm" $ do @@ -70,6 +41,9 @@ testPair levenshteinNorm "cake" "drake" (3 % 5) testPair levenshteinNorm "saturday" "sunday" (5 % 8) testPair levenshteinNorm "red" "wax" (0 % 1) +#if __GLASGOW_HASKELL__ >= 710 + testPair levenshteinNorm "a😀c" "abc" (2 % 3) +#endif testPair levenshteinNorm "lucky" "lucky" (1 % 1) testPair levenshteinNorm "" "" (1 % 1) describe "damerauLevenshtein" $ do @@ -79,6 +53,9 @@ testPair damerauLevenshtein "nose" "ones" 2 testPair damerauLevenshtein "thing" "sign" 3 testPair damerauLevenshtein "red" "wax" 3 +#if __GLASGOW_HASKELL__ >= 710 + testPair damerauLevenshtein "a😀c" "abc" 1 +#endif testPair damerauLevenshtein "lucky" "lucky" 0 testPair damerauLevenshtein "" "" 0 describe "damerauLevenshteinNorm" $ do @@ -88,6 +65,9 @@ testPair damerauLevenshteinNorm "nose" "ones" (1 % 2) testPair damerauLevenshteinNorm "thing" "sign" (2 % 5) testPair damerauLevenshteinNorm "red" "wax" (0 % 1) +#if __GLASGOW_HASKELL__ >= 710 + testPair damerauLevenshteinNorm "a😀c" "abc" (2 % 3) +#endif testPair damerauLevenshteinNorm "lucky" "lucky" (1 % 1) testPair damerauLevenshteinNorm "" "" (1 % 1) describe "hamming" $ do @@ -98,6 +78,9 @@ testPair hamming "2173896" "2233796" (Just 3) testPair hamming "toned" "roses" (Just 3) testPair hamming "red" "wax" (Just 3) +#if __GLASGOW_HASKELL__ >= 710 + testPair hamming "a😀c" "abc" (Just 1) +#endif testPair hamming "lucky" "lucky" (Just 0) testPair hamming "" "" (Just 0) testPair hamming "small" "big" Nothing @@ -117,7 +100,10 @@ testPair jaro "five" "ten" (0 % 1) testPair jaro "ten" "five" (0 % 1) testPair jaro "lucky" "lucky" (1 % 1) - testPair jaro "" "" (1 % 1) +#if __GLASGOW_HASKELL__ >= 710 + testPair jaro "a😀c" "abc" (7 % 9) +#endif + testPair jaro "" "" (0 % 1) describe "jaroWinkler" $ do testPair jaroWinkler "aa" "a" (17 % 20) testPair jaroWinkler "a" "aa" (17 % 20) @@ -134,7 +120,29 @@ testPair jaroWinkler "five" "ten" (0 % 1) testPair jaroWinkler "ten" "five" (0 % 1) testPair jaroWinkler "lucky" "lucky" (1 % 1) - testPair jaroWinkler "" "" (1 % 1) +#if __GLASGOW_HASKELL__ >= 710 + testPair jaroWinkler "a😀c" "abc" (4 % 5) +#endif + testPair jaroWinkler "" "" (0 % 1) + describe "overlap" $ do + testSwap overlap + testPair overlap "fly" "butterfly" (1 % 1) + testPair overlap "night" "nacht" (3 % 5) + testPair overlap "context" "contact" (5 % 7) + testPair overlap "red" "wax" (0 % 1) +#if __GLASGOW_HASKELL__ >= 710 + testPair overlap "a😀c" "abc" (2 % 3) +#endif + testPair overlap "lucky" "lucky" (1 % 1) + describe "jaccard" $ do + testSwap jaccard + testPair jaccard "xxx" "xyx" (1 % 2) + testPair jaccard "night" "nacht" (3 % 7) + testPair jaccard "context" "contact" (5 % 9) +#if __GLASGOW_HASKELL__ >= 710 + testPair overlap "a😀c" "abc" (2 % 3) +#endif + testPair jaccard "lucky" "lucky" (1 % 1) -- | Test that given function returns the same results when order of -- arguments is swapped. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/text-metrics-0.2.0/text-metrics.cabal new/text-metrics-0.3.0/text-metrics.cabal --- old/text-metrics-0.2.0/text-metrics.cabal 2016-10-09 16:43:36.000000000 +0200 +++ new/text-metrics-0.3.0/text-metrics.cabal 2017-06-13 12:04:50.000000000 +0200 @@ -1,42 +1,11 @@ --- --- Cabal configuration for ‘text-metrics’ package. --- --- Copyright © 2016 Mark Karpov <[email protected]> --- --- Redistribution and use in source and binary forms, with or without --- modification, are permitted provided that the following conditions are --- met: --- --- * Redistributions of source code must retain the above copyright notice, --- this list of conditions and the following disclaimer. --- --- * Redistributions in binary form must reproduce the above copyright --- notice, this list of conditions and the following disclaimer in the --- documentation and/or other materials provided with the distribution. --- --- * Neither the name Mark Karpov nor the names of contributors may be used --- to endorse or promote products derived from this software without --- specific prior written permission. --- --- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY --- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED --- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE --- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY --- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL --- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS --- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) --- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, --- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN --- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE --- POSSIBILITY OF SUCH DAMAGE. - name: text-metrics -version: 0.2.0 +version: 0.3.0 cabal-version: >= 1.10 +tested-with: GHC==7.8.4, GHC==7.10.3, GHC==8.0.2, GHC==8.2.1 license: BSD3 license-file: LICENSE.md -author: Mark Karpov <[email protected]> -maintainer: Mark Karpov <[email protected]> +author: Mark Karpov <[email protected]> +maintainer: Mark Karpov <[email protected]> homepage: https://github.com/mrkkrp/text-metrics bug-reports: https://github.com/mrkkrp/text-metrics/issues category: Text, Algorithms @@ -45,7 +14,6 @@ description: Calculate various string metrics efficiently. extra-doc-files: CHANGELOG.md , README.md -extra-source-files: cbits/*.h source-repository head type: git @@ -58,12 +26,10 @@ library build-depends: base >= 4.7 && < 5.0 + , containers >= 0.5.6.2 && < 0.6 , text >= 0.2 && < 1.3 - if !impl(ghc >= 7.10) - build-depends: nats == 1.* + , vector >= 0.11 && < 0.13 exposed-modules: Data.Text.Metrics - c-sources: cbits/text_metrics.c - include-dirs: cbits if flag(dev) ghc-options: -Wall -Werror else @@ -78,28 +44,39 @@ , base >= 4.7 && < 5.0 , hspec >= 2.0 && < 3.0 , text >= 0.2 && < 1.3 - , text-metrics >= 0.2.0 - if !impl(ghc >= 7.10) - build-depends: nats == 1.* + , text-metrics if flag(dev) ghc-options: -Wall -Werror else ghc-options: -O2 -Wall default-language: Haskell2010 -benchmark bench +benchmark bench-speed main-is: Main.hs - hs-source-dirs: bench + hs-source-dirs: bench-speed type: exitcode-stdio-1.0 - build-depends: base >= 4.7 && < 5.0 - , criterion >= 0.6.2.1 && < 1.2 - , deepseq >= 1.4 && < 1.5 + build-depends: base >= 4.7 && < 5.0 + , criterion >= 0.6.2.1 && < 1.3 + , deepseq >= 1.4 && < 1.5 , text >= 0.2 && < 1.3 - , text-metrics >= 0.2.0 - if !impl(ghc >= 7.10) - build-depends: nats == 1.* + , text-metrics if flag(dev) - ghc-options: -Wall -Werror + ghc-options: -O2 -Wall -Werror + else + ghc-options: -O2 -Wall + default-language: Haskell2010 + +benchmark bench-memory + main-is: Main.hs + hs-source-dirs: bench-memory + type: exitcode-stdio-1.0 + build-depends: base >= 4.7 && < 5.0 + , deepseq >= 1.4 && < 1.5 + , text >= 0.2 && < 1.3 + , text-metrics + , weigh >= 0.0.4 + if flag(dev) + ghc-options: -O2 -Wall -Werror else ghc-options: -O2 -Wall default-language: Haskell2010
