commit ghc-text-metrics for openSUSE:Factory

root Thu, 31 Aug 2017 12:00:43 -0700

Hello community,

here is the log from the commit of package ghc-text-metrics for 
openSUSE:Factory checked in at 2017-08-31 21:00:28
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/ghc-text-metrics (Old)
 and      /work/SRC/openSUSE:Factory/.ghc-text-metrics.new (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


Package is "ghc-text-metrics"

Thu Aug 31 21:00:28 2017 rev:3 rq:513514 version:0.3.0

Changes:
--------
--- /work/SRC/openSUSE:Factory/ghc-text-metrics/ghc-text-metrics.changes        
2017-06-22 10:39:20.558507940 +0200
+++ /work/SRC/openSUSE:Factory/.ghc-text-metrics.new/ghc-text-metrics.changes   
2017-08-31 21:00:30.341340143 +0200
@@ -1,0 +2,5 @@
+Thu Jul 27 14:06:28 UTC 2017 - [email protected]
+
+- Update to version 0.3.0.
+
+-------------------------------------------------------------------

Old:
----
  text-metrics-0.2.0.tar.gz
  text-metrics.cabal

New:
----
  text-metrics-0.3.0.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ ghc-text-metrics.spec ++++++
--- /var/tmp/diff_new_pack.K6p7GD/_old  2017-08-31 21:00:32.157085026 +0200
+++ /var/tmp/diff_new_pack.K6p7GD/_new  2017-08-31 21:00:32.185081092 +0200
@@ -19,17 +19,18 @@
 %global pkg_name text-metrics
 %bcond_with tests
 Name:           ghc-%{pkg_name}
-Version:        0.2.0
+Version:        0.3.0
 Release:        0
 Summary:        Calculate various string metrics efficiently
 License:        BSD-3-Clause
 Group:          Development/Languages/Other
 Url:            https://hackage.haskell.org/package/%{pkg_name}
 Source0:        
https://hackage.haskell.org/package/%{pkg_name}-%{version}/%{pkg_name}-%{version}.tar.gz
-Source1:        
https://hackage.haskell.org/package/%{pkg_name}-%{version}/revision/1.cabal#/%{pkg_name}.cabal
 BuildRequires:  ghc-Cabal-devel
+BuildRequires:  ghc-containers-devel
 BuildRequires:  ghc-rpm-macros
 BuildRequires:  ghc-text-devel
+BuildRequires:  ghc-vector-devel
 BuildRoot:      %{_tmppath}/%{name}-%{version}-build
 %if %{with tests}
 BuildRequires:  ghc-QuickCheck-devel
@@ -52,7 +53,6 @@
 
 %prep
 %setup -q -n %{pkg_name}-%{version}
-cp -p %{SOURCE1} %{pkg_name}.cabal
 
 %build
 %ghc_lib_build

++++++ text-metrics-0.2.0.tar.gz -> text-metrics-0.3.0.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/text-metrics-0.2.0/CHANGELOG.md 
new/text-metrics-0.3.0/CHANGELOG.md
--- old/text-metrics-0.2.0/CHANGELOG.md 2016-10-09 16:43:02.000000000 +0200
+++ new/text-metrics-0.3.0/CHANGELOG.md 2017-06-13 10:43:59.000000000 +0200
@@ -1,3 +1,13 @@
+## Text Metrics 0.3.0
+
+* All functions are now implemented in pure Haskell.
+
+* All functions return `Int` or `Ratio Int` instead of `Natural` and `Ratio
+  Natural`.
+
+* Added `overlap` (returns overlap coefficient) and `jaccard` (returns
+  Jaccard similarity coefficient).
+
 ## Text Metrics 0.2.0
 
 * Made the `levenshtein`, `levenshteinNorm`, `damerauLevenshtein`, and
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/text-metrics-0.2.0/Data/Text/Metrics.hs 
new/text-metrics-0.3.0/Data/Text/Metrics.hs
--- old/text-metrics-0.2.0/Data/Text/Metrics.hs 2016-10-09 16:43:02.000000000 
+0200
+++ new/text-metrics-0.3.0/Data/Text/Metrics.hs 2017-06-13 10:42:49.000000000 
+0200
@@ -1,34 +1,24 @@
 -- |
 -- Module      :  Data.Text.Metrics
--- Copyright   :  © 2016 Mark Karpov
+-- Copyright   :  © 2016–2017 Mark Karpov
 -- License     :  BSD 3 clause
 --
--- Maintainer  :  Mark Karpov <[email protected]>
+-- Maintainer  :  Mark Karpov <[email protected]>
 -- Stability   :  experimental
 -- Portability :  portable
 --
--- The module provides efficient implementations of various strings metrics.
--- It works with strict 'Text' values and returns either 'Natural' numbers
--- (because the metrics cannot be negative), or @'Ratio' 'Natural'@ values
--- because returned values are rational non-negative numbers by definition.
---
--- The functions provided here are the fastest implementations available for
--- use in Haskell programs. In fact the functions are implemented in C for
--- maximal efficiency, but this leads to a minor flaw. When we work with
--- 'Text' values in C, they are represented as UTF-16 encoded strings of
--- two-byte values. The algorithms treat the strings as if a character
--- corresponds to one element in such strings, which is true for almost all
--- modern text data. However, there are characters that are represented by
--- two adjoined elements in UTF-16: emoji, historic scripts, less used
--- Chinese ideographs, and some more. If input 'Text' of the functions
--- contains such characters, the functions may return slightly incorrect
--- result. Decide for yourself if this is acceptable for your use case, but
--- chances are you will never run into situations when the functions produce
--- incorrect results.
-
-{-# LANGUAGE CPP                      #-}
-{-# LANGUAGE ForeignFunctionInterface #-}
-{-# LANGUAGE OverloadedStrings        #-}
+-- The module provides efficient implementations of various strings metric
+-- algorithms. It works with strict 'Text' values.
+--
+-- __Note__: before version /0.3.0/ the package used C implementations of
+-- the algorithms under the hood. Beginning from version /0.3.0/, the
+-- implementations are written in Haskell while staying almost as fast, see:
+--
+-- <https://markkarpov.com/post/migrating-text-metrics.html>
+
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE CPP          #-}
+{-# LANGUAGE MultiWayIf   #-}
 
 module Data.Text.Metrics
   ( -- * Levenshtein variants
@@ -36,20 +26,25 @@
   , levenshteinNorm
   , damerauLevenshtein
   , damerauLevenshteinNorm
+    -- * Treating inputs like sets
+  , overlap
+  , jaccard
     -- * Other
   , hamming
   , jaro
   , jaroWinkler )
 where
 
+import Control.Monad
+import Control.Monad.ST
+import Data.Map.Strict (Map)
 import Data.Ratio
 import Data.Text
-import Foreign
-import Foreign.C.Types
-import Numeric.Natural
-import System.IO.Unsafe
-import qualified Data.Text           as T
-import qualified Data.Text.Foreign   as TF
+import GHC.Exts (inline)
+import qualified Data.Map.Strict as M
+import qualified Data.Text                   as T
+import qualified Data.Text.Unsafe            as TU
+import qualified Data.Vector.Unboxed.Mutable as VUM
 
 #if !MIN_VERSION_base(4,8,0)
 import Control.Applicative
@@ -59,82 +54,233 @@
 -- Levenshtein variants
 
 -- | Return Levenshtein distance between two 'Text' values. Classic
--- Levenshtein distance between two strings is minimal number of operations
--- necessary to transform one string into another. For Levenshtein distance
--- allowed operations are: deletion, insertion, and substitution.
+-- Levenshtein distance between two strings is the minimal number of
+-- operations necessary to transform one string into another. For
+-- Levenshtein distance allowed operations are: deletion, insertion, and
+-- substitution.
 --
 -- See also: <https://en.wikipedia.org/wiki/Levenshtein_distance>.
+--
+-- __Heads up__, before version /0.3.0/ this function returned
+-- 'Data.Numeric.Natural'.
 
-levenshtein :: Text -> Text -> Natural
-levenshtein = withTwo c_levenshtein
-
-foreign import ccall unsafe "tmetrics_levenshtein"
-  c_levenshtein :: CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt
+levenshtein :: Text -> Text -> Int
+levenshtein a b = fst (levenshtein_ a b)
 
 -- | Return normalized Levenshtein distance between two 'Text' values.
 -- Result is a non-negative rational number (represented as @'Ratio'
--- 'Natural'@), where 0 signifies no similarity between the strings, while 1
--- means exact match. The operation is virtually as fast as 'levenshtein'.
+-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
+-- strings, while 1 means exact match.
 --
 -- See also: <https://en.wikipedia.org/wiki/Levenshtein_distance>.
+--
+-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
+-- 'Data.Numeric.Natural'@.
+
+levenshteinNorm :: Text -> Text -> Ratio Int
+levenshteinNorm = norm levenshtein_
 
-levenshteinNorm :: Text -> Text -> Ratio Natural
-levenshteinNorm = norm levenshtein
-{-# INLINE levenshteinNorm #-}
+-- | An internal helper, returns Levenshtein distance as the first element
+-- of the tuple and max length of the two inputs as the second element of
+-- the tuple.
+
+levenshtein_ :: Text -> Text -> (Int, Int)
+levenshtein_ a b
+  | T.null a = (lenb, lenm)
+  | T.null b = (lena, lenm)
+  | otherwise = runST $ do
+      let v_len = lenb + 1
+      v <- VUM.unsafeNew (v_len * 2)
+      let gov !i =
+            when (i < v_len) $ do
+              VUM.unsafeWrite v i i
+              gov (i + 1)
+          goi !i !na !v0 !v1 = do
+            let !(TU.Iter ai da) = TU.iter a na
+                goj !j !nb =
+                  when (j < lenb) $ do
+                    let !(TU.Iter bj db) = TU.iter b nb
+                        cost = if ai == bj then 0 else 1
+                    x <- (+ 1) <$> VUM.unsafeRead v (v1 + j)
+                    y <- (+ 1) <$> VUM.unsafeRead v (v0 + j + 1)
+                    z <- (+ cost) <$> VUM.unsafeRead v (v0 + j)
+                    VUM.unsafeWrite v (v1 + j + 1) (min x (min y z))
+                    goj (j + 1) (nb + db)
+            when (i < lena) $ do
+              VUM.unsafeWrite v v1 (i + 1)
+              goj 0 0
+              goi (i + 1) (na + da) v1 v0
+      gov 0
+      goi 0 0 0 v_len
+      ld <- VUM.unsafeRead v (lenb + if even lena then 0 else v_len)
+      return (ld, lenm)
+  where
+    lena = T.length a
+    lenb = T.length b
+    lenm = max lena lenb
+{-# INLINE levenshtein_ #-}
 
 -- | Return Damerau-Levenshtein distance between two 'Text' values. The
 -- function works like 'levenshtein', but the collection of allowed
--- operations also includes transposition of two /adjacent/ characters. The
--- function is about 20% slower than 'levenshtein', but still pretty fast.
+-- operations also includes transposition of two /adjacent/ characters.
 --
 -- See also: 
<https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>.
+--
+-- __Heads up__, before version /0.3.0/ this function returned
+-- 'Data.Numeric.Natural'.
 
-damerauLevenshtein :: Text -> Text -> Natural
-damerauLevenshtein = withTwo c_damerau_levenshtein
-
-foreign import ccall unsafe "tmetrics_damerau_levenshtein"
-  c_damerau_levenshtein :: CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO 
CUInt
+damerauLevenshtein :: Text -> Text -> Int
+damerauLevenshtein a b = fst (damerauLevenshtein_ a b)
 
 -- | Return normalized Damerau-Levenshtein distance between two 'Text'
--- values. Result is a non-negative rational number (represented as @'Ratio'
--- 'Natural'@), where 0 signifies no similarity between the strings, while 1
--- means exact match. The operation is virtually as fast as
--- 'damerauLevenshtein'.
+-- values. 0 signifies no similarity between the strings, while 1 means
+-- exact match.
 --
 -- See also: 
<https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>.
+--
+-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
+-- 'Data.Numeric.Natural'@.
+
+damerauLevenshteinNorm :: Text -> Text -> Ratio Int
+damerauLevenshteinNorm = norm damerauLevenshtein_
+
+-- | An internal helper, returns Damerau-Levenshtein distance as the first
+-- element of the tuple and max length of the two inputs as the second
+-- element of the tuple.
+
+damerauLevenshtein_ :: Text -> Text -> (Int, Int)
+damerauLevenshtein_ a b
+  | T.null a = (lenb, lenm)
+  | T.null b = (lena, lenm)
+  | otherwise = runST $ do
+      let v_len = lenb + 1
+      v <- VUM.unsafeNew (v_len * 3)
+      let gov !i =
+            when (i < v_len) $ do
+              VUM.unsafeWrite v i i
+              gov (i + 1)
+          goi !i !na !ai_1 !v0 !v1 !v2 = do
+            let !(TU.Iter ai da) = TU.iter a na
+                goj !j !nb !bj_1 =
+                  when (j < lenb) $ do
+                    let !(TU.Iter bj db) = TU.iter b nb
+                        cost = if ai == bj then 0 else 1
+                    x <- (+ 1) <$> VUM.unsafeRead v (v1 + j)
+                    y <- (+ 1) <$> VUM.unsafeRead v (v0 + j + 1)
+                    z <- (+ cost) <$> VUM.unsafeRead v (v0 + j)
+                    let g = min x (min y z)
+                    val <- (+ cost) <$> VUM.unsafeRead v (v2 + j - 1)
+                    VUM.unsafeWrite v (v1 + j + 1) $
+                      if i > 0 && j > 0 && ai == bj_1 && ai_1 == bj && val < g
+                        then val
+                        else g
+                    goj (j + 1) (nb + db) bj
+            when (i < lena) $ do
+              VUM.unsafeWrite v v1 (i + 1)
+              goj 0 0 'a'
+              goi (i + 1) (na + da) ai v1 v2 v0
+      gov 0
+      goi 0 0 'a' 0 v_len (v_len * 2)
+      ld <- VUM.unsafeRead v (lenb + (lena `mod` 3) * v_len)
+      return (ld, lenm)
+  where
+    lena = T.length a
+    lenb = T.length b
+    lenm = max lena lenb
+{-# INLINE damerauLevenshtein_ #-}
+
+----------------------------------------------------------------------------
+-- Treating inputs like sets
+
+-- | Return overlap coefficient for two 'Text' values. Returned value is in
+-- the range from 0 (no similarity) to 1 (exact match). Return 1 if both
+-- 'Text' values are empty.
+--
+-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
+--
+-- @since 0.3.0
+
+overlap :: Text -> Text -> Ratio Int
+overlap a b =
+  if d == 0
+    then 1 % 1
+    else intersectionSize (mkTextMap a) (mkTextMap b) % d
+  where
+    d = min (T.length a) (T.length b)
+
+-- | Return Jaccard similarity coefficient for two 'Text' values. Returned
+-- value is in the range from 0 (no similarity) to 1 (exact match). Return 1
+-- if both
+--
+-- See also: <https://en.wikipedia.org/wiki/Jaccard_index>
+--
+-- @since 0.3.0
+
+jaccard :: Text -> Text -> Ratio Int
+jaccard a b =
+  if d == 0
+    then 1 % 1
+    else intersectionSize ma mb % d
+  where
+    ma = mkTextMap a
+    mb = mkTextMap b
+    d  = unionSize ma mb
+
+-- | Make a map from 'Char' to 'Int' representing how many times the 'Char'
+-- appears in the input 'Text'.
+
+mkTextMap :: Text -> Map Char Int
+mkTextMap = T.foldl' f M.empty
+  where
+    f m ch = M.insertWith (+) ch 1 m
+{-# INLINE mkTextMap #-}
+
+-- | Return intersection size between two 'Text'-maps.
+
+intersectionSize :: Map Char Int -> Map Char Int -> Int
+intersectionSize a b = M.foldl' (+) 0 (M.intersectionWith min a b)
+{-# INLINE intersectionSize #-}
+
+-- | Return union size between two 'Text'-maps.
 
-damerauLevenshteinNorm :: Text -> Text -> Ratio Natural
-damerauLevenshteinNorm = norm damerauLevenshtein
-{-# INLINE damerauLevenshteinNorm #-}
+unionSize :: Map Char Int -> Map Char Int -> Int
+unionSize a b = M.foldl' (+) 0 (M.unionWith max a b)
+{-# INLINE unionSize #-}
 
 ----------------------------------------------------------------------------
 -- Other
 
 -- | /O(n)/ Return Hamming distance between two 'Text' values. Hamming
--- distance is defined as number of positions at which the corresponding
+-- distance is defined as the number of positions at which the corresponding
 -- symbols are different. The input 'Text' values should be of equal length
 -- or 'Nothing' will be returned.
 --
 -- See also: <https://en.wikipedia.org/wiki/Hamming_distance>.
+--
+-- __Heads up__, before version /0.3.0/ this function returned @'Maybe'
+-- 'Data.Numeric.Natural'@.
 
-hamming :: Text -> Text -> Maybe Natural
+hamming :: Text -> Text -> Maybe Int
 hamming a b =
   if T.length a == T.length b
-    then Just . unsafePerformIO . TF.useAsPtr a $ \aptr size ->
-      TF.useAsPtr b $ \bptr _ ->
-        fromIntegral <$> c_hamming (fromIntegral size) aptr bptr
+    then Just (go 0 0 0)
     else Nothing
-
-foreign import ccall unsafe "tmetrics_hamming"
-  c_hamming :: CUInt -> Ptr Word16 -> Ptr Word16 -> IO CUInt
+  where
+    go !na !nb !r =
+      let !(TU.Iter cha da) = TU.iter a na
+          !(TU.Iter chb db) = TU.iter b nb
+      in if | na  == len -> r
+            | cha /= chb -> go (na + da) (nb + db) (r + 1)
+            | otherwise  -> go (na + da) (nb + db) r
+    len = TU.lengthWord16 a
 
 -- | Return Jaro distance between two 'Text' values. Returned value is in
--- range from 0 (no similarity) to 1 (exact match).
+-- the range from 0 (no similarity) to 1 (exact match).
 --
 -- While the algorithm is pretty clear for artificial examples (like those
 -- from the linked Wikipedia article), for /arbitrary/ strings, it may be
 -- hard to decide which of two strings should be considered as one having
--- “reference” order of characters (since order of matching characters in an
+-- “reference” order of characters (order of matching characters in an
 -- essential part of the definition of the algorithm). This makes us
 -- consider the first string the “reference” string (with correct order of
 -- characters). Thus generally,
@@ -144,71 +290,98 @@
 -- This asymmetry can be found in all implementations of the algorithm on
 -- the internet, AFAIK.
 --
--- See also: <http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
+-- See also: <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
 --
 -- @since 0.2.0
+--
+-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
+-- 'Data.Numeric.Natural'@.
 
-jaro :: Text -> Text -> Ratio Natural
-jaro = jaroCommon (\_ _ _ _ x -> return x)
-
-jaroCommon :: (CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> Ratio Natural -> 
IO (Ratio Natural)) -> Text -> Text -> Ratio Natural
-jaroCommon f a b = unsafePerformIO $ alloca $ \m' -> alloca $ \t' ->
-  TF.useAsPtr a $ \aptr asize ->
-    TF.useAsPtr b $ \bptr bsize ->
-      if asize == 0 || bsize == 0
-        then return (1 % 1)
-        else do
-          let asize' = fromIntegral asize
-              bsize' = fromIntegral bsize
-          c_jaro m' t' asize' aptr bsize' bptr
-          m <- fromIntegral <$> peek m'
-          t <- fromIntegral <$> peek t'
-          f asize' aptr bsize' bptr $
-            if m == 0
-              then 0
-              else ((m % fromIntegral asize) +
-                    (m % fromIntegral bsize) +
-                    ((m - t) % m)) / 3
-{-# INLINE jaroCommon #-}
-
-foreign import ccall unsafe "tmetrics_jaro"
-  c_jaro :: Ptr CUInt -> Ptr CUInt -> CUInt -> Ptr Word16 -> CUInt -> Ptr 
Word16 -> IO ()
+jaro :: Text -> Text -> Ratio Int
+jaro a b =
+  if T.null a || T.null b
+    then 0 % 1
+    else runST $ do
+      let lena = T.length a
+          lenb = T.length b
+          d =
+            if lena >= 2 && lenb >= 2
+              then max lena lenb `quot` 2 - 1
+              else 0
+      v <- VUM.replicate lenb (0 :: Int)
+      r <- VUM.replicate 3 (0 :: Int) -- tj, m, t
+      let goi !i !na !fromb = do
+            let !(TU.Iter ai da) = TU.iter a na
+                (from, fromb') =
+                  if i >= d
+                    then (i - d, fromb + TU.iter_ b fromb)
+                    else (0, 0)
+                to = min (i + d + 1) lenb
+                goj !j !nb =
+                  when (j < to) $ do
+                    let !(TU.Iter bj db) = TU.iter b nb
+                    used <- (== 1) <$> VUM.unsafeRead v j
+                    if not used && ai == bj
+                      then do
+                        tj <- VUM.unsafeRead r 0
+                        if j < tj
+                          then VUM.unsafeModify r (+ 1) 2
+                          else VUM.unsafeWrite  r 0 j
+                        VUM.unsafeWrite v j 1
+                        VUM.unsafeModify r (+ 1) 1
+                      else goj (j + 1) (nb + db)
+            when (i < lena) $ do
+              goj from fromb
+              goi (i + 1) (na + da) fromb'
+      goi 0 0 0
+      m <- VUM.unsafeRead r 1
+      t <- VUM.unsafeRead r 2
+      return $
+        if m == 0
+          then 0 % 1
+          else ((m % lena) +
+                (m % lenb) +
+                ((m - t) % m)) / 3
 
 -- | Return Jaro-Winkler distance between two 'Text' values. Returned value
 -- is in range from 0 (no similarity) to 1 (exact match).
 --
--- See also: <http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
+-- See also: <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
 --
 -- @since 0.2.0
+--
+-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
+-- 'Data.Numeric.Natural'@.
 
-jaroWinkler :: Text -> Text -> Ratio Natural
-jaroWinkler = jaroCommon g
+jaroWinkler :: Text -> Text -> Ratio Int
+jaroWinkler a b = dj + (1 % 10) * l * (1 - dj)
   where
-    g asize aptr bsize bptr dj = do
-      l <- fromIntegral <$> c_common_prefix asize aptr bsize bptr
-      return (dj + (1  % 10) * l * (1 - dj))
+    dj = inline (jaro a b)
+    l  = fromIntegral (commonPrefix a b)
 
-foreign import ccall unsafe "tmetrics_common_prefix"
-  c_common_prefix :: CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt
+-- | Return length of common prefix two 'Text' values have.
+
+commonPrefix :: Text -> Text -> Int
+commonPrefix a b = go 0 0 0
+  where
+    go !na !nb !r =
+      let !(TU.Iter cha da) = TU.iter a na
+          !(TU.Iter chb db) = TU.iter b nb
+      in if | na == lena -> r
+            | nb == lenb -> r
+            | cha == chb -> go (na + da) (nb + db) (r + 1)
+            | otherwise  -> r
+    lena = TU.lengthWord16 a
+    lenb = TU.lengthWord16 b
+{-# INLINE commonPrefix #-}
 
 ----------------------------------------------------------------------------
 -- Helpers
 
-withTwo
-  :: (CUInt -> Ptr Word16 -> CUInt -> Ptr Word16 -> IO CUInt)
-  -> Text
-  -> Text
-  -> Natural
-withTwo f a b =
-  unsafePerformIO . TF.useAsPtr a $ \aptr asize ->
-    TF.useAsPtr b $ \bptr bsize ->
-      fromIntegral <$> f (fromIntegral asize) aptr (fromIntegral bsize) bptr
-{-# INLINE withTwo #-}
-
-norm :: (Text -> Text -> Natural) -> Text -> Text -> Ratio Natural
+norm :: (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int
 norm f a b =
-  let r = f a b
+  let (r, l) = f a b
   in if r == 0
        then 1 % 1
-       else 1 % 1 - r % fromIntegral (max (T.length a) (T.length b))
+       else 1 % 1 - r % l
 {-# INLINE norm #-}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/text-metrics-0.2.0/LICENSE.md 
new/text-metrics-0.3.0/LICENSE.md
--- old/text-metrics-0.2.0/LICENSE.md   2016-01-03 14:37:56.000000000 +0100
+++ new/text-metrics-0.3.0/LICENSE.md   2017-06-02 17:36:51.000000000 +0200
@@ -1,4 +1,4 @@
-Copyright © 2016 Mark Karpov
+Copyright © 2016–2017 Mark Karpov
 
 All rights reserved.
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/text-metrics-0.2.0/README.md 
new/text-metrics-0.3.0/README.md
--- old/text-metrics-0.2.0/README.md    2016-10-09 16:43:02.000000000 +0200
+++ new/text-metrics-0.3.0/README.md    2017-06-13 10:42:49.000000000 +0200
@@ -7,76 +7,44 @@
 [![Build 
Status](https://travis-ci.org/mrkkrp/text-metrics.svg?branch=master)](https://travis-ci.org/mrkkrp/text-metrics)
 [![Coverage 
Status](https://coveralls.io/repos/mrkkrp/text-metrics/badge.svg?branch=master&service=github)](https://coveralls.io/github/mrkkrp/text-metrics?branch=master)
 
-The library provides efficient implementations of various strings metrics.
-It works with strict `Text` values and returns either `Natural` numbers
-(because the metrics cannot be negative), or `Ratio Natural` values because
-returned values are rational non-negative numbers by definition.
-
-The functions provided here are the fastest implementations available for
-use in Haskell programs. In fact the functions are implemented in C for
-maximal efficiency, but this leads to a minor flaw. When we work with `Text`
-values in C, they are represented as UTF-16 encoded strings of two-byte
-values. The algorithms treat the strings as if a character corresponds to
-one element in such strings, which is true for almost all modern text data.
-However, there are characters that are represented by two adjoined elements
-in UTF-16: emoji, historic scripts, less used Chinese ideographs, and some
-more. If input `Text` of the functions contains such characters, the
-functions may return slightly incorrect result. Decide for yourself if this
-is acceptable for your use case, but chances are you will never run into
-situations when the functions produce incorrect results.
+The library provides efficient implementations of various strings metric
+algorithms. It works with strict `Text` values.
 
 The current version of the package implements:
 
-* [Levenshtein distance](http://en.wikipedia.org/wiki/Levenshtein_distance)
-* [Normalized Levenshtein 
distance](http://en.wikipedia.org/wiki/Levenshtein_distance)
-* [Damerau-Levenshtein 
distance](http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
-* [Normalized Damerau-Levenshtein 
distance](http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
-* [Hamming distance](http://en.wikipedia.org/wiki/Hamming_distance)
-* [Jaro distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
-* [Jaro-Winkler 
distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
-
-TODO list:
-
-* [Overlap coefficient](http://en.wikipedia.org/wiki/Overlap_coefficient)
-* [Jaccard similarity coefficient](http://en.wikipedia.org/wiki/Jaccard_index)
+* [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
+* [Normalized Levenshtein 
distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
+* [Damerau-Levenshtein 
distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
+* [Normalized Damerau-Levenshtein 
distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
+* [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance)
+* [Jaro distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
+* [Jaro-Winkler 
distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
+* [Overlap coefficient](https://en.wikipedia.org/wiki/Overlap_coefficient)
+* [Jaccard similarity coefficient](https://en.wikipedia.org/wiki/Jaccard_index)
 
 ## Comparison with the `edit-distance` package
 
-There
-is [`edit-distance`](https://hackage.haskell.org/package/edit-distance)
-package whose scope overlaps with the scope of this package. The differences
-are:
+There is [`edit-distance`](https://hackage.haskell.org/package/edit-distance) 
package whose scope overlaps with the scope of
+this package. The differences are:
 
 * `edit-distance` allows to specify costs for every operation when
   calculating Levenshtein distance (insertion, deletion, substitution, and
   transposition). This is rarely needed though in real-world applications,
   IMO.
 
-* `edit-distance` only provides single Levenshtein distance, `text-metrics`
-  aims to provide implementations of most string metrics algorithms.
+* `edit-distance` only provides Levenshtein distance, `text-metrics` aims to
+  provide implementations of most string metrics algorithms.
 
 * `edit-distance` works on `Strings`, while `text-metrics` works on strict
   `Text` values.
 
-* As `README.md` of `edit-distance` states, “[the algorithms] have been
-  fairly heavily optimized”, which is apparently true, yet the
-  `text-metrics` is faster for short strings (under 64 characters) and even
-  faster for longer strings (scales better). How much faster? For short
-  strings more than ×3, and about ×4 for longer strings.
-
 ## Implementation
 
-All “meat” of the algorithms is written in C in a rather straightforward
-way. Levenshtein variants are based on the “iterative algorithm with two
-matrix rows” from Wikipedia with additional improvement that we do not copy
-current row of distances into previous row, but just swap the pointers
-(which is OK, since the arrays have equal length and current row will be
-overwritten in the next iteration anyway).
-
-Normalized versions are defined as thin (inlined) Haskell wrappers.
+Although we originally used C for speed, currently all functions are pure
+Haskell tuned for performance. See [this blog 
post](https://markkarpov.com/post/migrating-text-metrics.html) for more info.
 
 ## License
 
-Copyright © 2016 Mark Karpov
+Copyright © 2016–2017 Mark Karpov
 
 Distributed under BSD 3 clause license.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/text-metrics-0.2.0/bench/Main.hs 
new/text-metrics-0.3.0/bench/Main.hs
--- old/text-metrics-0.2.0/bench/Main.hs        2016-10-09 16:43:02.000000000 
+0200
+++ new/text-metrics-0.3.0/bench/Main.hs        1970-01-01 01:00:00.000000000 
+0100
@@ -1,64 +0,0 @@
---
--- Benchmarks for the ‘text-metrics’ package.
---
--- Copyright © 2016 Mark Karpov <[email protected]>
---
--- Redistribution and use in source and binary forms, with or without
--- modification, are permitted provided that the following conditions are
--- met:
---
--- * Redistributions of source code must retain the above copyright notice,
---   this list of conditions and the following disclaimer.
---
--- * Redistributions in binary form must reproduce the above copyright
---   notice, this list of conditions and the following disclaimer in the
---   documentation and/or other materials provided with the distribution.
---
--- * Neither the name Mark Karpov nor the names of contributors may be used
---   to endorse or promote products derived from this software without
---   specific prior written permission.
---
--- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY
--- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
--- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
--- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
--- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
--- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
--- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
--- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
--- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
--- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
--- POSSIBILITY OF SUCH DAMAGE.
-
-module Main (main) where
-
-import Control.DeepSeq
-import Criterion.Main
-import Data.Text (Text)
-import Data.Text.Metrics
-import qualified Data.Text as T
-
-main :: IO ()
-main = defaultMain
-  [ btmetric "levenshtein"            levenshtein
-  , btmetric "levenshteinNorm"        levenshteinNorm
-  , btmetric "damerauLevenshtein"     damerauLevenshtein
-  , btmetric "damerauLevenshteinNorm" damerauLevenshteinNorm
-  , btmetric "hamming"                hamming
-  , btmetric "jaro"                   jaro
-  , btmetric "jaroWinkler"            jaroWinkler ]
-
--- | Produce benchmark group to test
-
-btmetric :: NFData a => String -> (Text -> Text -> a) -> Benchmark
-btmetric name f = bgroup name (bs <$> stdSeries)
-  where
-    bs n = env (return (testData n, testData n)) (bench (show n) . nf (uncurry 
f))
-
--- | The series of lengths to try with every function as part of 'btmetric'.
-
-stdSeries :: [Int]
-stdSeries = [5,10,20,40,80,160]
-
-testData :: Int -> Text
-testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a'..'z']
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/text-metrics-0.2.0/bench-memory/Main.hs 
new/text-metrics-0.3.0/bench-memory/Main.hs
--- old/text-metrics-0.2.0/bench-memory/Main.hs 1970-01-01 01:00:00.000000000 
+0100
+++ new/text-metrics-0.3.0/bench-memory/Main.hs 2017-06-13 10:42:49.000000000 
+0200
@@ -0,0 +1,38 @@
+module Main (main) where
+
+import Control.DeepSeq
+import Control.Monad
+import Data.Text (Text)
+import Data.Text.Metrics
+import Weigh
+import qualified Data.Text as T
+
+main :: IO ()
+main = mainWith $ do
+  setColumns [Case, Allocated, GCs, Max]
+  bmetric "levenshtein"            levenshtein
+  bmetric "levenshteinNorm"        levenshteinNorm
+  bmetric "damerauLevenshtein"     damerauLevenshtein
+  bmetric "damerauLevenshteinNorm" damerauLevenshteinNorm
+  bmetric "overlap"                overlap
+  bmetric "jaccard"                jaccard
+  bmetric "hamming"                hamming
+  bmetric "jaro"                   jaro
+  bmetric "jaroWinkler"            jaroWinkler
+
+-- | Perform a series to measurements with the same metric function.
+
+bmetric :: NFData a
+  => String            -- ^ Name of the benchmark group
+  -> (Text -> Text -> a) -- ^ The function to benchmark
+  -> Weigh ()
+bmetric name f = forM_ stdSeries $ \n ->
+  func (name ++ "/" ++ show n) (uncurry f) (testData n, testData n)
+
+-- | The series of lengths to try with every function as part of 'btmetric'.
+
+stdSeries :: [Int]
+stdSeries = [5,10,20,40,80,160]
+
+testData :: Int -> Text
+testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a'..'z']
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/text-metrics-0.2.0/bench-speed/Main.hs 
new/text-metrics-0.3.0/bench-speed/Main.hs
--- old/text-metrics-0.2.0/bench-speed/Main.hs  1970-01-01 01:00:00.000000000 
+0100
+++ new/text-metrics-0.3.0/bench-speed/Main.hs  2017-06-13 10:42:49.000000000 
+0200
@@ -0,0 +1,34 @@
+module Main (main) where
+
+import Control.DeepSeq
+import Criterion.Main
+import Data.Text (Text)
+import Data.Text.Metrics
+import qualified Data.Text as T
+
+main :: IO ()
+main = defaultMain
+  [ btmetric "levenshtein"            levenshtein
+  , btmetric "levenshteinNorm"        levenshteinNorm
+  , btmetric "damerauLevenshtein"     damerauLevenshtein
+  , btmetric "damerauLevenshteinNorm" damerauLevenshteinNorm
+  , btmetric "overlap"                overlap
+  , btmetric "jaccard"                jaccard
+  , btmetric "hamming"                hamming
+  , btmetric "jaro"                   jaro
+  , btmetric "jaroWinkler"            jaroWinkler ]
+
+-- | Produce benchmark group to test.
+
+btmetric :: NFData a => String -> (Text -> Text -> a) -> Benchmark
+btmetric name f = bgroup name (bs <$> stdSeries)
+  where
+    bs n = env (return (testData n, testData n)) (bench (show n) . nf (uncurry 
f))
+
+-- | The series of lengths to try with every function as part of 'btmetric'.
+
+stdSeries :: [Int]
+stdSeries = [5,10,20,40,80,160]
+
+testData :: Int -> Text
+testData n = T.pack . take n . drop (n `mod` 4) . cycle $ ['a'..'z']
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/text-metrics-0.2.0/cbits/text_metrics.c 
new/text-metrics-0.3.0/cbits/text_metrics.c
--- old/text-metrics-0.2.0/cbits/text_metrics.c 2016-10-09 16:43:02.000000000 
+0200
+++ new/text-metrics-0.3.0/cbits/text_metrics.c 1970-01-01 01:00:00.000000000 
+0100
@@ -1,207 +0,0 @@
-/*
- * This file is part of ‘text-metrics’ package.
- *
- * Copyright © 2016 Mark Karpov
- *
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * * Redistributions of source code must retain the above copyright notice,
- *   this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- *
- * * Neither the name Mark Karpov nor the names of contributors may be used to
- *   endorse or promote products derived from this software without specific
- *   prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
- * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
- * NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "text_metrics.h"
-
-/* Levenshtein variants */
-
-unsigned int tmetrics_levenshtein (unsigned int la, uint16_t *a, unsigned int 
lb, uint16_t *b)
-{
-  if (la == 0) return lb;
-  if (lb == 0) return la;
-
-  unsigned int v_len = lb + 1, *v0, *v1, i, j;
-
-  if (v_len > VLEN_MAX)
-    {
-      v0 = malloc(sizeof(unsigned int) * v_len);
-      v1 = malloc(sizeof(unsigned int) * v_len);
-    }
-  else
-    {
-      v0 = alloca(sizeof(unsigned int) * v_len);
-      v1 = alloca(sizeof(unsigned int) * v_len);
-    }
-
-  for (i = 0; i < v_len; i++)
-    v0[i] = i;
-
-  for (i = 0; i < la; i++)
-    {
-      v1[0] = i + 1;
-
-      for (j = 0; j < lb; j++)
-        {
-          unsigned int cost = *(a + i) == *(b + j) ? 0 : 1;
-          unsigned int x = *(v1 + j) + 1;
-          unsigned int y = *(v0 + j + 1) + 1;
-          unsigned int z = *(v0 + j) + cost;
-          *(v1 + j + 1) = MIN(x, MIN(y, z));
-        }
-
-      unsigned int *ptr = v0;
-      v0 = v1;
-      v1 = ptr;
-    }
-
-  unsigned int result = *(v0 + lb);
-
-  if (v_len > VLEN_MAX)
-    {
-      free(v0);
-      free(v1);
-    }
-
-  return result;
-}
-
-unsigned int tmetrics_damerau_levenshtein (unsigned int la, uint16_t *a, 
unsigned int lb, uint16_t *b)
-{
-  if (la == 0) return lb;
-  if (lb == 0) return la;
-
-  unsigned int v_len = lb + 1, *v0, *v1, *v2, i, j;
-
-  if (v_len > VLEN_MAX)
-    {
-      v0 = malloc(sizeof(unsigned int) * v_len);
-      v1 = malloc(sizeof(unsigned int) * v_len);
-      v2 = malloc(sizeof(unsigned int) * v_len);
-    }
-  else
-    {
-      v0 = alloca(sizeof(unsigned int) * v_len);
-      v1 = alloca(sizeof(unsigned int) * v_len);
-      v2 = alloca(sizeof(unsigned int) * v_len);
-    }
-
-  for (i = 0; i < v_len; i++)
-    v0[i] = i;
-
-  for (i = 0; i < la; i++)
-    {
-      v1[0] = i + 1;
-
-      for (j = 0; j < lb; j++)
-        {
-          unsigned int cost = *(a + i) == *(b + j) ? 0 : 1;
-          unsigned int x = *(v1 + j) + 1;
-          unsigned int y = *(v0 + j + 1) + 1;
-          unsigned int z = *(v0 + j) + cost;
-          *(v1 + j + 1) = MIN(x, MIN(y, z));
-          unsigned int val = *(v2 + j - 1) + cost;
-          if ( i > 0                    &&
-               j > 0                    &&
-               *(a + i) == *(b + j - 1) &&
-               *(a + i - 1) == *(b + j) &&
-               val < *(v1 + j + 1) )
-            *(v1 + j + 1) = val;
-        }
-
-      unsigned int *ptr = v0;
-      v0 = v1;
-      v1 = v2;
-      v2 = ptr;
-    }
-
-  unsigned int result = *(v0 + lb);
-
-  if (v_len > VLEN_MAX)
-    {
-      free(v0);
-      free(v1);
-      free(v2);
-    }
-
-  return result;
-}
-
-/* Other */
-
-unsigned int tmetrics_hamming (unsigned int len, uint16_t *a, uint16_t *b)
-{
-  unsigned int acc = 0, i;
-  for (i = 0; i < len; i++)
-    {
-      if (*(a + i) != *(b + i)) acc++;
-    }
-  return acc;
-}
-
-void tmetrics_jaro (unsigned int *m, unsigned int *t, unsigned int la, 
uint16_t *a, unsigned int lb, uint16_t *b)
-{
-  unsigned int d = 0, i, j, tj = 0, from, to;
-  char *v;
-
-  *m = 0, *t = 0;
-
-  if (la >= 2 && lb >= 2)
-    d = MAX(lb, la) / 2 - 1;
-
-  if (lb > VLEN_MAX) v = malloc(sizeof(char) * lb);
-  else               v = alloca(sizeof(char) * lb);
-
-  for (i = 0; i < lb; i++) *(v + i) = 0;
-
-  for (i = 0; i < la; i++)
-    {
-      from = i < d ? 0 : i - d;
-      to   = MIN(i + d + 1, lb);
-      for (j = from; j < to; j++)
-        {
-          if (*(v + j)) continue;
-
-          if (*(a + i) == *(b + j))
-            {
-              if (j < tj) (*t)++;
-              else tj = j;
-              *(v + j) = 1;
-              (*m)++;
-              break;
-            }
-        }
-    }
-
-  if (lb > VLEN_MAX) free(v);
-}
-
-unsigned int tmetrics_common_prefix (unsigned int la, uint16_t *a, unsigned 
int lb, uint16_t *b)
-{
-  unsigned int acc = 0, i, l = MIN(la, lb);
-  for (i = 0; i < l; i++)
-    {
-      if (*(a + i) == *(b + i)) acc++;
-      else break;
-    }
-  return acc;
-}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/text-metrics-0.2.0/cbits/text_metrics.h 
new/text-metrics-0.3.0/cbits/text_metrics.h
--- old/text-metrics-0.2.0/cbits/text_metrics.h 2016-10-09 16:43:02.000000000 
+0200
+++ new/text-metrics-0.3.0/cbits/text_metrics.h 1970-01-01 01:00:00.000000000 
+0100
@@ -1,56 +0,0 @@
-/*
- * This file is part of ‘text-metrics’ package.
- *
- * Copyright © 2016 Mark Karpov
- *
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * * Redistributions of source code must retain the above copyright notice,
- *   this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- *
- * * Neither the name Mark Karpov nor the names of contributors may be used to
- *   endorse or promote products derived from this software without specific
- *   prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
- * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
- * NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef TEXT_METRICS_H
-#define TEXT_METRICS_H
-
-#include <stdint.h>
-#include <stdlib.h>
-
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-#define VLEN_MAX 255 /* Up to this length we use alloca. */
-
-/* Levenshein variants */
-
-unsigned int tmetrics_levenshtein (unsigned int, uint16_t *, unsigned int, 
uint16_t *);
-unsigned int tmetrics_damerau_levenshtein (unsigned int, uint16_t *, unsigned 
int, uint16_t *);
-
-/* Other */
-
-unsigned int tmetrics_hamming (unsigned int, uint16_t *, uint16_t *);
-void tmetrics_jaro (unsigned int *, unsigned int *, unsigned int, uint16_t *, 
unsigned int, uint16_t *);
-unsigned int tmetrics_common_prefix (unsigned int, uint16_t *, unsigned int, 
uint16_t *);
-
-#endif /* TEXT_METRICS_H */
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/text-metrics-0.2.0/tests/Main.hs 
new/text-metrics-0.3.0/tests/Main.hs
--- old/text-metrics-0.2.0/tests/Main.hs        2016-10-09 16:43:02.000000000 
+0200
+++ new/text-metrics-0.3.0/tests/Main.hs        2017-06-13 10:42:49.000000000 
+0200
@@ -1,35 +1,3 @@
---
--- Tests for the ‘text-metrics’ package.
---
--- Copyright © 2016 Mark Karpov <[email protected]>
---
--- Redistribution and use in source and binary forms, with or without
--- modification, are permitted provided that the following conditions are
--- met:
---
--- * Redistributions of source code must retain the above copyright notice,
---   this list of conditions and the following disclaimer.
---
--- * Redistributions in binary form must reproduce the above copyright
---   notice, this list of conditions and the following disclaimer in the
---   documentation and/or other materials provided with the distribution.
---
--- * Neither the name Mark Karpov nor the names of contributors may be used
---   to endorse or promote products derived from this software without
---   specific prior written permission.
---
--- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY
--- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
--- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
--- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
--- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
--- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
--- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
--- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
--- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
--- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
--- POSSIBILITY OF SUCH DAMAGE.
-
 {-# LANGUAGE CPP                  #-}
 {-# LANGUAGE OverloadedStrings    #-}
 {-# OPTIONS_GHC -fno-warn-orphans #-}
@@ -62,6 +30,9 @@
       testPair levenshtein "cake"     "drake"   2
       testPair levenshtein "saturday" "sunday"  3
       testPair levenshtein "red"      "wax"     3
+#if __GLASGOW_HASKELL__ >= 710
+      testPair levenshtein "a😀c"     "abc"     1
+#endif
       testPair levenshtein "lucky"    "lucky"   0
       testPair levenshtein ""         ""        0
   describe "levenshteinNorm" $ do
@@ -70,6 +41,9 @@
     testPair levenshteinNorm "cake"     "drake"   (3 % 5)
     testPair levenshteinNorm "saturday" "sunday"  (5 % 8)
     testPair levenshteinNorm "red"      "wax"     (0 % 1)
+#if __GLASGOW_HASKELL__ >= 710
+    testPair levenshteinNorm "a😀c"     "abc"     (2 % 3)
+#endif
     testPair levenshteinNorm "lucky"    "lucky"   (1 % 1)
     testPair levenshteinNorm ""         ""        (1 % 1)
   describe "damerauLevenshtein" $ do
@@ -79,6 +53,9 @@
     testPair damerauLevenshtein "nose"          "ones"      2
     testPair damerauLevenshtein "thing"         "sign"      3
     testPair damerauLevenshtein "red"           "wax"       3
+#if __GLASGOW_HASKELL__ >= 710
+    testPair damerauLevenshtein "a😀c"          "abc"       1
+#endif
     testPair damerauLevenshtein "lucky"         "lucky"     0
     testPair damerauLevenshtein ""              ""          0
   describe "damerauLevenshteinNorm" $ do
@@ -88,6 +65,9 @@
     testPair damerauLevenshteinNorm "nose"          "ones"      (1 % 2)
     testPair damerauLevenshteinNorm "thing"         "sign"      (2 % 5)
     testPair damerauLevenshteinNorm "red"           "wax"       (0 % 1)
+#if __GLASGOW_HASKELL__ >= 710
+    testPair damerauLevenshteinNorm "a😀c"          "abc"       (2 % 3)
+#endif
     testPair damerauLevenshteinNorm "lucky"         "lucky"     (1 % 1)
     testPair damerauLevenshteinNorm ""              ""          (1 % 1)
   describe "hamming" $ do
@@ -98,6 +78,9 @@
     testPair hamming "2173896" "2233796" (Just 3)
     testPair hamming "toned"   "roses"   (Just 3)
     testPair hamming "red"     "wax"     (Just 3)
+#if __GLASGOW_HASKELL__ >= 710
+    testPair hamming "a😀c"    "abc"      (Just 1)
+#endif
     testPair hamming "lucky"   "lucky"   (Just 0)
     testPair hamming ""        ""        (Just 0)
     testPair hamming "small"   "big"     Nothing
@@ -117,7 +100,10 @@
     testPair jaro "five"   "ten"      (0  % 1)
     testPair jaro "ten"    "five"     (0  % 1)
     testPair jaro "lucky"  "lucky"    (1  % 1)
-    testPair jaro ""       ""         (1  % 1)
+#if __GLASGOW_HASKELL__ >= 710
+    testPair jaro "a😀c"   "abc"      (7  % 9)
+#endif
+    testPair jaro ""       ""         (0  % 1)
   describe "jaroWinkler" $ do
     testPair jaroWinkler "aa" "a"            (17 % 20)
     testPair jaroWinkler "a"  "aa"           (17 % 20)
@@ -134,7 +120,29 @@
     testPair jaroWinkler "five"   "ten"      (0  % 1)
     testPair jaroWinkler "ten"    "five"     (0  % 1)
     testPair jaroWinkler "lucky"  "lucky"    (1  % 1)
-    testPair jaroWinkler ""       ""         (1  % 1)
+#if __GLASGOW_HASKELL__ >= 710
+    testPair jaroWinkler "a😀c"   "abc"      (4  % 5)
+#endif
+    testPair jaroWinkler ""       ""         (0  % 1)
+  describe "overlap" $ do
+    testSwap overlap
+    testPair overlap "fly"     "butterfly" (1 % 1)
+    testPair overlap "night"   "nacht"     (3 % 5)
+    testPair overlap "context" "contact"   (5 % 7)
+    testPair overlap "red"     "wax"       (0 % 1)
+#if __GLASGOW_HASKELL__ >= 710
+    testPair overlap "a😀c"   "abc"   (2 % 3)
+#endif
+    testPair overlap "lucky" "lucky" (1 % 1)
+  describe "jaccard" $ do
+    testSwap jaccard
+    testPair jaccard "xxx"     "xyx"     (1 % 2)
+    testPair jaccard "night"   "nacht"   (3 % 7)
+    testPair jaccard "context" "contact" (5 % 9)
+#if __GLASGOW_HASKELL__ >= 710
+    testPair overlap "a😀c"     "abc"     (2 % 3)
+#endif
+    testPair jaccard "lucky"   "lucky"   (1 % 1)
 
 -- | Test that given function returns the same results when order of
 -- arguments is swapped.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/text-metrics-0.2.0/text-metrics.cabal 
new/text-metrics-0.3.0/text-metrics.cabal
--- old/text-metrics-0.2.0/text-metrics.cabal   2016-10-09 16:43:36.000000000 
+0200
+++ new/text-metrics-0.3.0/text-metrics.cabal   2017-06-13 12:04:50.000000000 
+0200
@@ -1,42 +1,11 @@
---
--- Cabal configuration for ‘text-metrics’ package.
---
--- Copyright © 2016 Mark Karpov <[email protected]>
---
--- Redistribution and use in source and binary forms, with or without
--- modification, are permitted provided that the following conditions are
--- met:
---
--- * Redistributions of source code must retain the above copyright notice,
---   this list of conditions and the following disclaimer.
---
--- * Redistributions in binary form must reproduce the above copyright
---   notice, this list of conditions and the following disclaimer in the
---   documentation and/or other materials provided with the distribution.
---
--- * Neither the name Mark Karpov nor the names of contributors may be used
---   to endorse or promote products derived from this software without
---   specific prior written permission.
---
--- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS “AS IS” AND ANY
--- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
--- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
--- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
--- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
--- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
--- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
--- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
--- STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
--- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
--- POSSIBILITY OF SUCH DAMAGE.
-
 name:                 text-metrics
-version:              0.2.0
+version:              0.3.0
 cabal-version:        >= 1.10
+tested-with:          GHC==7.8.4, GHC==7.10.3, GHC==8.0.2, GHC==8.2.1
 license:              BSD3
 license-file:         LICENSE.md
-author:               Mark Karpov <[email protected]>
-maintainer:           Mark Karpov <[email protected]>
+author:               Mark Karpov <[email protected]>
+maintainer:           Mark Karpov <[email protected]>
 homepage:             https://github.com/mrkkrp/text-metrics
 bug-reports:          https://github.com/mrkkrp/text-metrics/issues
 category:             Text, Algorithms
@@ -45,7 +14,6 @@
 description:          Calculate various string metrics efficiently.
 extra-doc-files:      CHANGELOG.md
                     , README.md
-extra-source-files:   cbits/*.h
 
 source-repository head
   type:               git
@@ -58,12 +26,10 @@
 
 library
   build-depends:      base             >= 4.7 && < 5.0
+                    , containers       >= 0.5.6.2 && < 0.6
                     , text             >= 0.2 && < 1.3
-  if !impl(ghc >= 7.10)
-    build-depends:    nats             == 1.*
+                    , vector           >= 0.11 && < 0.13
   exposed-modules:    Data.Text.Metrics
-  c-sources:          cbits/text_metrics.c
-  include-dirs:       cbits
   if flag(dev)
     ghc-options:      -Wall -Werror
   else
@@ -78,28 +44,39 @@
                     , base             >= 4.7 && < 5.0
                     , hspec            >= 2.0 && < 3.0
                     , text             >= 0.2 && < 1.3
-                    , text-metrics     >= 0.2.0
-  if !impl(ghc >= 7.10)
-    build-depends:    nats             == 1.*
+                    , text-metrics
   if flag(dev)
     ghc-options:      -Wall -Werror
   else
     ghc-options:      -O2 -Wall
   default-language:   Haskell2010
 
-benchmark bench
+benchmark bench-speed
   main-is:            Main.hs
-  hs-source-dirs:     bench
+  hs-source-dirs:     bench-speed
   type:               exitcode-stdio-1.0
-  build-depends:      base             >= 4.7     && < 5.0
-                    , criterion        >= 0.6.2.1 && < 1.2
-                    , deepseq          >= 1.4     && < 1.5
+  build-depends:      base             >= 4.7 && < 5.0
+                    , criterion        >= 0.6.2.1 && < 1.3
+                    , deepseq          >= 1.4 && < 1.5
                     , text             >= 0.2 && < 1.3
-                    , text-metrics     >= 0.2.0
-  if !impl(ghc >= 7.10)
-    build-depends:    nats             == 1.*
+                    , text-metrics
   if flag(dev)
-    ghc-options:      -Wall -Werror
+    ghc-options:      -O2 -Wall -Werror
+  else
+    ghc-options:      -O2 -Wall
+  default-language:   Haskell2010
+
+benchmark bench-memory
+  main-is:            Main.hs
+  hs-source-dirs:     bench-memory
+  type:               exitcode-stdio-1.0
+  build-depends:      base             >= 4.7 && < 5.0
+                    , deepseq          >= 1.4 && < 1.5
+                    , text             >= 0.2 && < 1.3
+                    , text-metrics
+                    , weigh            >= 0.0.4
+  if flag(dev)
+    ghc-options:      -O2 -Wall -Werror
   else
     ghc-options:      -O2 -Wall
   default-language:   Haskell2010

commit ghc-text-metrics for openSUSE:Factory

Reply via email to