On Sat, Jan 21, 2012 at 8:18 AM, Twan van Laarhoven <twa...@gmail.com> wrote:
> Notice that there are lots of "miku-X" prefixes found. This is probably not
> what you want. What exactly do you want the algorithm to do? For example, ""
> is obviously a prefix of every string, but it is not very long. On the other
> hand, each string is a prefix of itself, but that prefix is shared by only
> one string (usually).
>
> By the way, the sort and compare adjacent pairs approach corresponds to
> "atLeastThisManyDescendants 2".
Ah, now the code makes sense to me. It's longer, but it is a heck of a
lot more principled and readable, so I'm happy to replace my version
with yours. It's not too hard to convert it into a CLI filter with
optional depth (default of 2, replicating original behavior):
import qualified Data.Map as Map
import System.Environment (getArgs)
import Data.List (sortBy)
import Data.Ord (comparing)
main :: IO ()
main = do arg <- getArgs
let n = if null arg then 2 else read (head arg) :: Int
interact (unlines . chunk n . lines)
chunk :: Int -> [String] -> [String]
chunk n = map prefix . sortByLength . atLeastThisManyDescendants n . fromList
where sortByLength :: [CommonPrefix Char] -> [CommonPrefix Char]
sortByLength = sortBy (comparing (numDescendant . names))
.....
And the results seem kosher (printing just the prefixes is probably
the best idea, but wouldn't be too hard to switch to printing full
filenames - just filter the original file list with the extracted
prefix from each CommonPrefix):
$ ls music/vocaloid/| runhaskell lcp.hs 5
miku-s
miku-t
miku-r
rin-
miku-a
gumi-
luka-
$ ls music/vocaloid/| runhaskell lcp.hs 4
miku-h
miku-m
miku-n
miku-p
miku-s
miku-t
miku-r
rin-
miku-a
gumi-
luka-
$ ls music/vocaloid/| runhaskell lcp.hs # with 2
chorus-
gumi-mo
gumi-s
kaito-
luka-emon
luka-t
miku-acolorlinkingworld-
miku-akayaka
miku-cleantears-remind2011natsu-
miku-dan
miku-ele
miku-galaxyodyssey-
miku-ha
miku-inn
miku-jemappelle-motion-
miku-kz-
miku-lo
miku-m@rk-
miku-plustellia-壁の彩度-
miku-ro
miku-se
miku-ta
miku-the
miku-tinyparadise-
miku-ジラートP-birthdayofeden-
miku-杯本選
miku-般若心経
niconicochorus-
yuki-
len-
luka-di
miku-re:package-
miku-n
rin-
--
gwern
http://www.gwern.net
import qualified Data.Map as Map
import System.Environment (getArgs)
import Data.List (sortBy)
import Data.Ord (comparing)
main :: IO ()
main = do arg <- getArgs
let n = if null arg then 2 else read (head arg) :: Int
interact (unlines . chunk n . lines)
chunk :: Int -> [String] -> [String]
chunk n = map prefix . sortByLength . atLeastThisManyDescendants n . fromList
where sortByLength :: [CommonPrefix Char] -> [CommonPrefix Char]
sortByLength = sortBy (comparing (numDescendant . names))
-- A trie datatype
data Trie a = Trie { numLeafs, numDescendant :: !Int
, children :: Map.Map a (Trie a) }
instance (Show a) => Show (Trie a) where
showsPrec _ t = showString "fromList " . shows (toList t)
-- The empty trie
empty :: Trie a
empty = Trie 0 0 Map.empty
-- A trie that contains a single string
singleton :: Ord a => [a] -> Trie a
singleton [] = Trie 1 1 Map.empty
singleton (x:xs) = Trie 0 1 (Map.singleton x (singleton xs))
-- Merge two tries
merge :: Ord a => Trie a -> Trie a -> Trie a
merge (Trie l d c) (Trie l' d' c')
= Trie (l+l') (d+d') (Map.unionWith merge c c')
fromList :: Ord a => [[a]] -> Trie a
fromList = foldr (merge . singleton) empty
toList :: Trie a -> [[a]]
toList (Trie l _ c)
= replicate l []
++ [ x:xs | (x,t) <- Map.toList c, xs <- toList t ]
data CommonPrefix a = Prefix { prefix :: [a], names :: Trie a }
instance (Show a) => Show (CommonPrefix a) where
showsPrec _ (Prefix p ns) = shows p . showString " ++ " . shows (toList ns)
-- Find prefixes that have at least minD descendants.
-- when there is a prefix xs with >=minD descendants, then shorter prefixes will not be returned
atLeastThisManyDescendants :: Int -> Trie a -> [CommonPrefix a]
atLeastThisManyDescendants minD trie@(Trie _ d c)
| d < minD = [] -- too few descendants
| null forChildren = [Prefix [] trie] -- all longer prefixes have too few descendants, but this prefix doesn't
| otherwise = forChildren -- there are longer prefixes with enough descendants, return them
where
forChildren = [ Prefix (x:pfx) nms
| (x,t) <- Map.toList c
, Prefix pfx nms <- atLeastThisManyDescendants minD t ]
{- *Main> mapM_ (print . prefix) $ atLeastThisManyDescendants 4 test1
"gumi-"
"luka-"
"miku-a"
"miku-h"
"miku-m"
"miku-n"
"miku-p"
"miku-r"
"miku-s"
"miku-t"
"rin-"
test1 :: Trie Char
test1 = fromList
["chorus-kiminoshiranaimonogatari.ogg"
,"chorus-mrmusic.ogg"
,"choucho-lastnightgoodnight.ogg"
,"dylanislame-aikotoba.ogg"
,"electriclove-エレクトリック・ラブ-korskremix.ogg"
,"gumi-bacon8-justhangingaround.ogg"
,"gumi-iapologizetoyou.ogg"
,"gumi-montblanc.ogg"
,"gumi-mozaikrole.ogg"
,"gumi-ハッピーシンセサイザ.ogg"
,"gumi-showasengirl.ogg"
,"gumi-sweetfloatflatsスイートフロートアパート.ogg"
,"gumi-timewarpedafterchoppingmystagbeetle.ogg"
,"gumi-オリジナル曲-付きホシメグリ.ogg"
,"gumi-ミクオリジナル親友.ogg"
,"kaito-byakkoyano.ogg"
,"kaito-flowertail.ogg"
,"kasaneteto-tam-ochamekinou重音テト吹っ切れたおちゃめ機能.ogg"
,"len-crime-timetosaygoodbye.ogg"
,"len-fire◎flower.ogg"
,"len-ponponpon.ogg"
,"lily-prototype.ogg"
,"luka-apolxcore-waitingforyou.ogg"
,"luka-dimトロイ.ogg"
,"luka-dion-myheartwillgoon.ogg"
,"luka-dirgefilozofio-dirgeasleepinjesus.ogg"
,"luka-アゴアニキ-doubelariatダブルラリアット.ogg"
,"luka-emon-heartbeats.ogg"
,"luka-emonloid3-ハローハロー.ogg"
,"luka-everybreathyoutake.ogg"
,"luka-オリジナル-garden.ogg"
,"luka-justbefriends.ogg"
,"lukameiko-gemini.ogg"
,"luka-milkyway.ogg"
,"luka-やみくろ-かいぎ.ogg"
,"luka-tic-tick.ogg"
,"luka-torinouta.ogg"
,"luka-zeijakukei-shounenshoujo.ogg"
,"luka-勝手にアニメ-nologic-作ってみた.ogg"
,"luka-駄目人間.ogg"
,"meiko-artemis-awake.ogg"
,"miku-9ronicleプラチナ.ogg"
,"miku-acolorlinkingworld-この世界の下で.ogg"
,"miku-acolorlinkingworld-青い花.ogg"
,"miku-a+jugos-lullabyforkindness.ogg"
,"miku-akayaka-beacon.ogg"
,"miku-akayakap-sunrise.ogg"
,"miku-aoihana.ogg"
,"miku-arabianresponse.ogg"
,"miku-avtechno-tear.ogg"
,"miku-こえをきかせてcicci.ogg"
,"miku-cleantears-remind2011natsu-greenhillzonecrystiararemix.ogg"
,"miku-cleantears-remind2011natsu-夏影summerwindremix.ogg"
,"miku-clocklockworks.ogg"
,"miku-dancedancevol2-runner.ogg"
,"miku-daniwellp-chaoticuniverse.ogg"
,"miku-dixieflatline-shinonomescrumble.ogg"
,"miku-electricloveエレクトリックラヴ.ogg"
,"miku-elegumitokyo-kissmebaby.ogg"
,"miku-galaxyodyssey-cryingirl.ogg"
,"miku-galaxyodyssey-galaxyspacelines.ogg"
,"miku-hakamairi.ogg"
,"miku-haruna.ogg"
,"miku-heartshooter.ogg"
,"miku-hoshikuzutokakera.ogg"
,"miku-innes.ogg"
,"miku-innocence初音ミク.ogg"
,"miku-jemappelle-motion-likeyou.ogg"
,"miku-jemappelle-motion-ohwell.ogg"
,"miku-jevannip-myfavoritesummer.ogg"
,"miku-kakokyuudance-過呼吸ダンス.ogg"
,"miku-kz-packaged.ogg"
,"miku-kz-tellyourworld.ogg"
,"miku-lastscene.ogg"
,"miku-lostmemories付き-初音ミク.ogg"
,"miku-lovelyday.ogg"
,"miku-いいわけlove_song.ogg"
,"mikulukagumi-prayfor.ogg"
,"miku-maple-初音ミク楓-オリジナル曲.ogg"
,"miku-more1.5.ogg"
,"mik...@rk-eklosion.ogg"
,"mik...@rk-kirch.ogg"
,"miku-nana-ボーナストラック-ハッピー般若コア.ogg"
,"miku-nekomimiswitch.ogg"
,"miku-nightrainbow.ogg"
,"miku-noyounome.ogg"
,"miku-むかしむかしのきょうのぼくオリジナル.ogg"
,"miku-pandolistp-neverendinghammertime.ogg"
,"miku-ジラートP-birthdayofeden-deepsleep.ogg"
,"miku-ジラートP-birthdayofeden-水中読書.ogg"
,"miku-plustellia-dear.ogg"
,"miku-plustellia-壁の彩度-crazygirl.ogg"
,"miku-plustellia-壁の彩度-discoradio.ogg"
,"miku-ぽわぽわP-ストロボライト.ogg"
,"miku-rabbitforgets.ogg"
,"miku-re:package-lastnightgoodnight.ogg"
,"miku-re:package-ourmusic.ogg"
,"miku-re:package-sutorobonaitsu.ogg"
,"miku-rollinggirl.ogg"
,"miku-ryo-メルト-melt.ogg"
,"miku-senseiniitteyaro.ogg"
,"miku-sevencolors-レモネード.ogg"
,"miku-shoukinosatadenia.ogg"
,"miku-stratosphere.ogg"
,"miku-supernova.ogg"
,"miku-tam-lastnightgoodnight.ogg"
,"miku-tanatofobia.ogg"
,"miku-thearmyforyourenvy-スーパー・ノヴァ.ogg"
,"miku-theendlesslove.ogg"
,"miku-tinyparadise-snowflake.ogg"
,"miku-tinyparadise-tinyparadise.ogg"
,"miku-unfragment.ogg"
,"miku-worldismine-ルドイズマイン.ogg"
,"miku-yakiimo.ogg"
,"miku-文学少年の憂鬱-オリジナル.ogg"
,"miku-カラフルポップビートオリジナル曲.ogg"
,"miku-杯本選life.ogg"
,"miku-杯本選初音ミクどういうことなのダンス.ogg"
,"miku-般若心経beautyfloor-buddhamix.ogg"
,"miku-般若心経ポップ.ogg"
,"niconicochorus-blackrockshooter.ogg"
,"niconicochorus-justbefriends.ogg"
,"rin-dixieflatline-gemini.ogg"
,"rin-elegumitokyo-二人、恋してgirlsside.ogg"
,"rin-helloworld.ogg"
,"rin-jutenija.ogg"
,"rin-lastnightgoodnight.ogg"
,"rin-ripples-evergreen.ogg"
,"rin-っ´ω`c.ogg"
,"rollinggirl-piano.ogg"
,"seeu-gagain-따라리라ddadada.ogg"
,"utau-雪歌ユフbeyondオリジナル曲.ogg"
,"yuki-discochocolatheque.ogg"
,"yuki-shouwasenhosiga^ru.ogg"
,"yuki-shouwasenhosiga^ru.ogg"]
-}
_______________________________________________
Haskell-Cafe mailing list
Haskell-Cafe@haskell.org
http://www.haskell.org/mailman/listinfo/haskell-cafe