Hello community, here is the log from the commit of package ghc-clustering for openSUSE:Factory checked in at 2017-03-03 17:48:37 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/ghc-clustering (Old) and /work/SRC/openSUSE:Factory/.ghc-clustering.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "ghc-clustering" Fri Mar 3 17:48:37 2017 rev:2 rq:461614 version:0.3.1 Changes: -------- --- /work/SRC/openSUSE:Factory/ghc-clustering/ghc-clustering.changes 2016-12-11 13:26:57.286826588 +0100 +++ /work/SRC/openSUSE:Factory/.ghc-clustering.new/ghc-clustering.changes 2017-03-03 17:48:38.350337189 +0100 @@ -1,0 +2,5 @@ +Sun Feb 12 14:11:44 UTC 2017 - [email protected] + +- Update to version 0.3.1 with cabal2obs. + +------------------------------------------------------------------- Old: ---- clustering-0.2.1.tar.gz New: ---- clustering-0.3.1.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ ghc-clustering.spec ++++++ --- /var/tmp/diff_new_pack.Zy6y5L/_old 2017-03-03 17:48:39.110229871 +0100 +++ /var/tmp/diff_new_pack.Zy6y5L/_new 2017-03-03 17:48:39.114229307 +0100 @@ -1,7 +1,7 @@ # # spec file for package ghc-clustering # -# Copyright (c) 2016 SUSE LINUX GmbH, Nuernberg, Germany. +# Copyright (c) 2017 SUSE LINUX GmbH, Nuernberg, Germany. # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -19,15 +19,14 @@ %global pkg_name clustering %bcond_with tests Name: ghc-%{pkg_name} -Version: 0.2.1 +Version: 0.3.1 Release: 0 Summary: High performance clustering algorithms License: MIT -Group: System/Libraries +Group: Development/Languages/Other Url: https://hackage.haskell.org/package/%{pkg_name} Source0: https://hackage.haskell.org/package/%{pkg_name}-%{version}/%{pkg_name}-%{version}.tar.gz BuildRequires: ghc-Cabal-devel -# Begin cabal-rpm deps: BuildRequires: ghc-binary-devel BuildRequires: ghc-containers-devel BuildRequires: ghc-matrices-devel @@ -35,6 +34,7 @@ BuildRequires: ghc-parallel-devel BuildRequires: ghc-primitive-devel BuildRequires: ghc-rpm-macros +BuildRequires: ghc-unordered-containers-devel BuildRequires: ghc-vector-devel BuildRoot: %{_tmppath}/%{name}-%{version}-build %if %{with tests} @@ -45,7 +45,6 @@ BuildRequires: ghc-tasty-hunit-devel BuildRequires: ghc-tasty-quickcheck-devel %endif -# End cabal-rpm deps %description Following clutering methods are included in this library: @@ -70,20 +69,14 @@ %prep %setup -q -n %{pkg_name}-%{version} - %build %ghc_lib_build - %install %ghc_lib_install - %check -%if %{with tests} -%{cabal} test -%endif - +%cabal_test %post devel %ghc_pkg_recache ++++++ clustering-0.2.1.tar.gz -> clustering-0.3.1.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/clustering-0.2.1/benchmarks/Bench/KMeans.hs new/clustering-0.3.1/benchmarks/Bench/KMeans.hs --- old/clustering-0.2.1/benchmarks/Bench/KMeans.hs 2015-06-28 23:44:21.000000000 +0200 +++ new/clustering-0.3.1/benchmarks/Bench/KMeans.hs 2016-11-11 21:02:16.000000000 +0100 @@ -1,18 +1,20 @@ module Bench.KMeans ( benchKMeans ) where -import Criterion.Main -import qualified Data.Matrix.Unboxed as MU -import qualified Data.Vector.Unboxed as U -import System.Random.MWC -import System.IO.Unsafe - -import AI.Clustering.KMeans - -import Bench.Utils - -gen :: GenIO -gen = unsafePerformIO createSystemRandom +import Criterion.Main +import qualified Data.Matrix.Unboxed as MU +import qualified Data.Vector.Unboxed as U +import Data.Word +import System.IO.Unsafe +import System.Random.MWC + +import AI.Clustering.KMeans +import Bench.Utils + +gen :: U.Vector Word32 +gen = unsafePerformIO $ do + g <- createSystemRandom + fmap fromSeed $ save g dat :: MU.Matrix Double dat = unsafePerformIO $ fmap MU.fromRows $ randVectors 1000 10 @@ -21,11 +23,12 @@ benchKMeans = bgroup "KMeans clustering" [ bgroup "AI.Clustering.KMeans" [ bench "k-means++ (n = 1000, k = 7)" $ - whnfIO $ kmeans' gen KMeansPP 7 dat + whnf ( \x -> membership $ kmeans 7 x defaultKMeansOpts + { kmeansMethod = KMeansPP + , kmeansSeed = gen } ) dat , bench "forgy (n = 1000, k = 7)" $ - whnfIO $ kmeans' gen Forgy 7 dat + whnf ( \x -> membership $ kmeans 7 x defaultKMeansOpts + { kmeansMethod = KMeansPP + , kmeansSeed = gen } ) dat ] ] - -kmeans' :: GenIO -> Method -> Int -> MU.Matrix Double -> IO (U.Vector Int) -kmeans' g method k = fmap _clusters . kmeans g method k diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/clustering-0.2.1/clustering.cabal new/clustering-0.3.1/clustering.cabal --- old/clustering-0.2.1/clustering.cabal 2015-06-28 23:44:21.000000000 +0200 +++ new/clustering-0.3.1/clustering.cabal 2016-11-11 21:04:13.000000000 +0100 @@ -1,8 +1,5 @@ --- Initial fastcluster.cabal generated by cabal init. For further --- documentation, see http://haskell.org/cabal/users-guide/ - name: clustering -version: 0.2.1 +version: 0.3.1 synopsis: High performance clustering algorithms description: Following clutering methods are included in this library: @@ -20,11 +17,10 @@ copyright: (c) 2015 Kai Zhang category: Math build-type: Simple --- extra-source-files: cabal-version: >=1.10 library - exposed-modules: + exposed-modules: AI.Clustering.Hierarchical AI.Clustering.Hierarchical.Internal AI.Clustering.Hierarchical.Types @@ -33,8 +29,6 @@ AI.Clustering.KMeans.Types AI.Clustering.Utils --- other-modules: - build-depends: base >=4.0 && <5.0 , binary @@ -43,6 +37,7 @@ , mwc-random , parallel , primitive + , unordered-containers , vector hs-source-dirs: src @@ -59,7 +54,7 @@ Test.Utils default-language: Haskell2010 - build-depends: + build-depends: base , binary , mwc-random @@ -84,7 +79,7 @@ Bench.Utils default-language: Haskell2010 - build-depends: + build-depends: base , criterion , mwc-random diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/clustering-0.2.1/src/AI/Clustering/KMeans/Internal.hs new/clustering-0.3.1/src/AI/Clustering/KMeans/Internal.hs --- old/clustering-0.2.1/src/AI/Clustering/KMeans/Internal.hs 2015-06-28 23:44:21.000000000 +0200 +++ new/clustering-0.3.1/src/AI/Clustering/KMeans/Internal.hs 2016-11-10 09:03:18.000000000 +0100 @@ -1,17 +1,6 @@ -{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE BangPatterns #-} {-# LANGUAGE FlexibleContexts #-} --------------------------------------------------------------------------------- --- | --- Module : AI.Clustering.KMeans.Internal --- Copyright : (c) 2015 Kai Zhang --- License : MIT --- --- Maintainer : [email protected] --- Stability : experimental --- Portability : portable --- --- <module description starting at first column> --------------------------------------------------------------------------------- + module AI.Clustering.KMeans.Internal {-# WARNING "To be used by developer only" #-} ( forgy @@ -19,38 +8,39 @@ , sumSquares ) where -import Control.Monad (forM_) -import Control.Monad.Primitive (PrimMonad, PrimState) -import Data.List (nub) -import qualified Data.Matrix.Unboxed as MU -import qualified Data.Vector.Generic as G -import qualified Data.Vector.Unboxed as U -import qualified Data.Vector.Unboxed.Mutable as UM -import System.Random.MWC (uniformR, Gen) +import Control.Monad.Primitive (PrimMonad, PrimState) +import qualified Data.HashSet as S +import Data.List (nub) +import qualified Data.Matrix.Unboxed as MU +import qualified Data.Vector.Generic as G +import qualified Data.Vector.Unboxed as U +import System.Random.MWC (Gen, uniformR) +import System.Random.MWC.Distributions (categorical) + forgy :: (PrimMonad m, G.Vector v a) => Gen (PrimState m) - -> Int -- number of clusters - -> v a -- data - -> (a -> U.Vector Double) + -> Int -- ^ The number of clusters + -> v a -- ^ Input data + -> (a -> U.Vector Double) -- ^ Feature extraction function -> m (MU.Matrix Double) forgy g k dat fn | k > n = error "k is larger than sample size" - | otherwise = iter + | otherwise = loop where - iter = do - vec <- randN g k . U.enumFromN 0 $ n - let xs = map (\i -> fn $ dat `G.unsafeIndex` i) . U.toList $ vec + loop = do + vec <- uniformRN (0, n-1) k g + let xs = map (fn . G.unsafeIndex dat) vec if length (nub xs) == length xs - then return . MU.fromRows $ xs - else iter + then return $ MU.fromRows xs + else loop n = G.length dat {-# INLINE forgy #-} kmeansPP :: (PrimMonad m, G.Vector v a) => Gen (PrimState m) - -> Int - -> v a - -> (a -> U.Vector Double) + -> Int -- ^ The number of clusters + -> v a -- ^ Input data + -> (a -> U.Vector Double) -- ^ Feature extraction function -> m (MU.Matrix Double) kmeansPP g k dat fn | k > n = error "k is larger than sample size" @@ -59,44 +49,27 @@ loop [c1] 1 where loop centers !k' - | k' == k = return $ MU.fromRows $ map (\i -> fn $ dat `G.unsafeIndex` i) centers + | k' == k = return $ MU.fromRows $ map (fn . G.unsafeIndex dat) centers | otherwise = do - c' <- chooseWithProb g $ U.map (shortestDist centers) rowIndices + c' <- flip categorical g $ U.generate n $ \i -> minimum $ + map (\c -> sumSquares (fn $ G.unsafeIndex dat i) (fn $ G.unsafeIndex dat c)) + centers loop (c':centers) (k'+1) - n = G.length dat - rowIndices = U.enumFromN 0 n - shortestDist centers x = minimum $ map (\i -> - sumSquares (fn $ dat `G.unsafeIndex` x) (fn $ dat `G.unsafeIndex` i)) centers {-# INLINE kmeansPP #-} -chooseWithProb :: PrimMonad m - => Gen (PrimState m) - -> U.Vector Double -- ^ weights, may not be normalized - -> m Int -- ^ result/index -chooseWithProb g ws = do - x <- uniformR (0,sum') g - return $ loop x 0 0 - where - loop v !cdf !i | cdf' >= v = i - | otherwise = loop v cdf' (i+1) - where cdf' = cdf + ws `U.unsafeIndex` i - - sum' = U.sum ws -{-# INLINE chooseWithProb #-} - --- | Random select k samples from a population -randN :: PrimMonad m => Gen (PrimState m) -> Int -> U.Vector Int -> m (U.Vector Int) -randN g k xs = do - v <- U.thaw xs - forM_ [0..k-1] $ \i -> do - j <- uniformR (i, lst) g - UM.unsafeSwap v i j - U.unsafeFreeze . UM.take k $ v - where - lst = U.length xs - 1 -{-# INLINE randN #-} - sumSquares :: U.Vector Double -> U.Vector Double -> Double sumSquares xs = U.sum . U.zipWith (\x y -> (x - y)**2) xs {-# INLINE sumSquares #-} + +-- | Generate N non-duplicated uniformly distributed random variables in a given range. +uniformRN :: PrimMonad m => (Int, Int) -> Int -> Gen (PrimState m) -> m [Int] +uniformRN (lo, hi) n g | hi - lo + 1 < n = error "Range is too narrow!" + | otherwise = loop S.empty + where + loop m | S.size m >= n = return $ S.toList m + | otherwise = do + x <- uniformR (lo,hi) g + if x `S.member` m + then loop m + else loop $ S.insert x m diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/clustering-0.2.1/src/AI/Clustering/KMeans/Types.hs new/clustering-0.3.1/src/AI/Clustering/KMeans/Types.hs --- old/clustering-0.2.1/src/AI/Clustering/KMeans/Types.hs 2015-06-28 23:44:21.000000000 +0200 +++ new/clustering-0.3.1/src/AI/Clustering/KMeans/Types.hs 2016-11-10 18:21:34.000000000 +0100 @@ -11,19 +11,36 @@ -- <module description starting at first column> -------------------------------------------------------------------------------- module AI.Clustering.KMeans.Types - ( KMeans(..) + ( KMeansOpts(..) + , defaultKMeansOpts + , KMeans(..) , Method(..) ) where import qualified Data.Matrix.Unboxed as MU import qualified Data.Vector.Unboxed as U +import Data.Word (Word32) + +data KMeansOpts = KMeansOpts + { kmeansMethod :: Method + , kmeansSeed :: (U.Vector Word32) -- ^ Seed for random number generation + , kmeansClusters :: Bool -- ^ Wether to return clusters, may use a lot memory + } + +defaultKMeansOpts :: KMeansOpts +defaultKMeansOpts = KMeansOpts + { kmeansMethod = KMeansPP + , kmeansSeed = U.fromList [1,2,3,4,5,6,7] + , kmeansClusters = True + } -- | Results from running kmeans -data KMeans = KMeans - { _clusters :: U.Vector Int -- ^ A vector of integers (0 ~ k-1) +data KMeans a = KMeans + { membership :: U.Vector Int -- ^ A vector of integers (0 ~ k-1) -- indicating the cluster to which each -- point is allocated. - , _centers :: MU.Matrix Double -- ^ A matrix of cluster centers. + , centers :: MU.Matrix Double -- ^ A matrix of cluster centers. + , clusters :: Maybe [[a]] } deriving (Show) -- | Different initialization methods @@ -31,3 +48,4 @@ -- observations from the data set and uses these -- as the initial means. | KMeansPP -- ^ K-means++ algorithm. + | Centers (MU.Matrix Double) -- ^ Provide a set of k centroids diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/clustering-0.2.1/src/AI/Clustering/KMeans.hs new/clustering-0.3.1/src/AI/Clustering/KMeans.hs --- old/clustering-0.2.1/src/AI/Clustering/KMeans.hs 2015-06-28 23:44:21.000000000 +0200 +++ new/clustering-0.3.1/src/AI/Clustering/KMeans.hs 2016-11-11 20:59:09.000000000 +0100 @@ -1,28 +1,15 @@ {-# LANGUAGE FlexibleContexts #-} --------------------------------------------------------------------------------- --- | --- Module : AI.Clustering.KMeans --- Copyright : (c) 2015 Kai Zhang --- License : MIT --- Maintainer : [email protected] --- Stability : experimental --- Portability : portable --- --- Kmeans clustering --------------------------------------------------------------------------------- + module AI.Clustering.KMeans ( KMeans(..) + , KMeansOpts(..) + , defaultKMeansOpts , kmeans , kmeansBy - , kmeansWith -- * Initialization methods , Method(..) - -- * Useful functions - , decode - , withinSS - -- * References -- $references ) where @@ -30,7 +17,6 @@ import Control.Monad (forM_) import Control.Monad.Primitive (PrimMonad, PrimState) import qualified Data.Matrix.Unboxed as MU -import qualified Data.Matrix.Generic as MG import qualified Data.Matrix.Unboxed.Mutable as MM import Data.Ord (comparing) import qualified Data.Vector as V @@ -39,46 +25,63 @@ import qualified Data.Vector.Unboxed as U import qualified Data.Vector.Unboxed.Mutable as UM import Data.List (minimumBy, foldl') -import System.Random.MWC (Gen) +import System.Random.MWC (Gen, initialize) +import Control.Monad.ST (runST) -import AI.Clustering.KMeans.Types (KMeans(..), Method(..)) +import AI.Clustering.KMeans.Types import AI.Clustering.KMeans.Internal (sumSquares, forgy, kmeansPP) -- | Perform K-means clustering -kmeans :: (PrimMonad m, MG.Matrix mat U.Vector Double) - => Gen (PrimState m) - -> Method - -> Int - -> mat U.Vector Double - -> m KMeans -kmeans g method k mat = kmeansBy g method k dat (MG.takeRow mat) +kmeans :: Int -- ^ The number of clusters + -> MU.Matrix Double -- ^ Input data stored as rows in a matrix + -> KMeansOpts + -> KMeans (U.Vector Double) +kmeans k mat opts = KMeans member cs grps where - dat = U.enumFromN 0 $ MG.rows mat + (member, cs) = kmeans' initial dat fn + grps = if kmeansClusters opts + then Just $ decode member $ MU.toRows mat + else Nothing + dat = U.enumFromN 0 $ MU.rows mat + fn = MU.takeRow mat + initial = runST $ do + gen <- initialize $ kmeansSeed opts + case kmeansMethod opts of + Forgy -> forgy gen k dat fn + KMeansPP -> kmeansPP gen k dat fn + Centers c -> return c {-# INLINE kmeans #-} --- | K-means algorithm -kmeansBy :: (PrimMonad m, G.Vector v a) - => Gen (PrimState m) - -> Method - -> Int -- ^ number of clusters - -> v a -- ^ data stores in rows +-- | Perform K-means clustering, using a feature extraction function +kmeansBy :: G.Vector v a + => Int -- ^ The number of clusters + -> v a -- ^ Input data -> (a -> U.Vector Double) - -> m KMeans -kmeansBy g method k dat fn = do - initial <- case method of - Forgy -> forgy g k dat fn - KMeansPP -> kmeansPP g k dat fn - return $ kmeansWith initial dat fn + -> KMeansOpts + -> KMeans a +kmeansBy k dat fn opts = KMeans member cs grps + where + (member, cs) = kmeans' initial dat fn + grps = if kmeansClusters opts + then Just $ decode member $ G.toList dat + else Nothing + initial = runST $ do + gen <- initialize $ kmeansSeed opts + case kmeansMethod opts of + Forgy -> forgy gen k dat fn + KMeansPP -> kmeansPP gen k dat fn + Centers c -> return c {-# INLINE kmeansBy #-} -- | K-means algorithm -kmeansWith :: G.Vector v a - => MU.Matrix Double -- ^ initial set of k centroids - -> v a -- ^ each row represents a point - -> (a -> U.Vector Double) - -> KMeans -kmeansWith initial dat fn | d /= MU.cols initial || k > n = error "check input" - | otherwise = KMeans member centers +kmeans' :: G.Vector v a + => MU.Matrix Double -- ^ Initial set of k centroids + -> v a -- ^ Input data + -> (a -> U.Vector Double) -- ^ Feature extraction function + -> (U.Vector Int, MU.Matrix Double) +kmeans' initial dat fn + | U.length (fn $ G.head dat) /= d = error "Dimension mismatched." + | otherwise = (member, centers) where (member, centers) = loop initial U.empty loop means membership @@ -109,34 +112,35 @@ forM_ [0..d-1] $ \j -> MM.unsafeRead m (i,j) >>= MM.unsafeWrite m (i,j) . (/fromIntegral c) return m - n = G.length dat k = MU.rows initial d = MU.cols initial -{-# INLINE kmeansWith #-} +{-# INLINE kmeans' #-} -- | Assign data to clusters based on KMeans result -decode :: KMeans -> [a] -> [[a]] -decode result xs = V.toList $ V.create $ do - v <- VM.replicate n [] - forM_ (zip (U.toList membership) xs) $ \(i,x) -> +decode :: U.Vector Int -> [a] -> [[a]] +decode member xs = V.toList $ V.create $ do + v <- VM.replicate n [] + forM_ (zip (U.toList member) xs) $ \(i,x) -> VM.unsafeRead v i >>= VM.unsafeWrite v i . (x:) return v where - membership = _clusters result - n = U.maximum membership + 1 + n = U.maximum member + 1 +{-# INLINE decode #-} --- | Compute within-cluster sum of squares +{- +-- Compute within-cluster sum of squares withinSS :: KMeans -> MU.Matrix Double -> [Double] withinSS result mat = zipWith f (decode result [0 .. MU.rows mat-1]) . MU.toRows . _centers $ result where f c center = foldl' (+) 0 $ map (sumSquares center . MU.takeRow mat) c + -} -- $references -- --- Arthur, D. and Vassilvitskii, S. (2007). k-means++: the advantages of careful --- seeding. Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete --- algorithms. Society for Industrial and Applied Mathematics Philadelphia, PA, +-- Arthur, D. and Vassilvitskii, S. (2007). k-means++: the advantages of careful +-- seeding. Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete +-- algorithms. Society for Industrial and Applied Mathematics Philadelphia, PA, -- USA. pp. 1027–1035.
