-- based on http://jtauber.com/blog/2008/02/10/a_new_kind_of_graded_reader/
-- TODO: read knownwords from file
-- print out matching sentences as well (make optional)
-- fix performance; goal: handle Frank Herbert corpus in under 5 minutes
import Data.Ord
import Data.Char (isPunctuation, toLower)
import Data.List -- (nub, sort)
import qualified Data.Set as Set
import qualified Data.Map as Map
import Control.Parallel.Strategies
import Data.Function (on)
import Data.List.Split (splitWhen)
import System.IO.UTF8 (getContents, putStrLn)
import System.Environment (getArgs)
main :: IO ()
main = do depth - fmap (read . head) $ getArgs
corpus - System.IO.UTF8.getContents
let pcorpus = processCorpus corpus
let knownwords = map (map toLower) [You, dont, see, more, than, that, The, first, episode, of, Kare, Kano, is, rotten, with, Evangelion, visual, motifs, the, trains, the, spotlights, and, telephone, poles, and, wires, the, masks, and, this, is, how, everyone, sees, me, etc, a, it, did, are, to, in, I, Dune, was, Stalin, Mussolini, Hitler, Churchill, beginning, That, all, be, like, on, an, Its, But, only, you, themes, into, as, my, human, paradox,he,said,paul,his,she,her,not,him,had,for,at,alia,no,from,what,asked,they,there,have,stilgar]
let optimalwords = answer depth pcorpus knownwords
System.IO.UTF8.putStrLn optimalwords
-- | Clean up. Don't want 'Je suis. to look different from Je suis...
--
-- stringPunctuation Greetings, fellow human flesh-sacks! ~ Greetings fellow human fleshsacks
stripPunctuation :: String - String
stripPunctuation = filter (not . isPunctuation)
-- Turn a single big document into a stream of sentences of individual words; lower-case so we don't get
-- multiple hits for 'He', 'he' etc
processCorpus :: String - [[String]]
processCorpus = map (sort . words . stripPunctuation) . splitWhen (=='.') . map toLower
sentences :: (NFData a, Ord a) = [[a]] - Map.Map Int (Set.Set a)
sentences = Map.fromList . zip [(0::Int)..] . map Set.fromList
fidiv :: (Integral a, Fractional b) = a - a - b
fidiv = (/) `on` fromIntegral
ranks :: (NFData v, Ord k, Ord v) = Map.Map k (Set.Set v) - [(v, Rational)]
ranks s = Map.toList .
Map.fromListWith (+) $
[(word, rank) |
(_sentenceId, wrds) - Map.toList s,
let rank = 1 `fidiv` Set.size wrds,
word - Set.toList wrds]
approximation :: (NFData v, Ord k, Ord v) = Map.Map k (Set.Set v) - Int - [v]
approximation _ 0 = []
approximation s n =
case ranks s of
[] - []
xs - let word = fst . maximumBy (comparing snd) $ xs in
let withoutWord = Map.map (Set.delete word) s
in word : approximation withoutWord (n-1)
process :: (Ord v, NFData v) = [[v]] - [Int] - [[v]]
process ss ns = map (approximation $ sentences ss) ns
getBest :: [Int] -[[String]] - String
getBest x y = unlines . last $ process y x
filterKnown :: [String] - [[String]] - [[String]]
filterKnown known = filter (not . null) . map (filter (flip notElem $ known))
answer :: Int - [[String]] - [String] - String
answer depth corp known = let corp' = filterKnown known corp in getBest [1..depth] corp'___
Haskell-Cafe mailing list
Haskell-Cafe@haskell.org
http://www.haskell.org/mailman/listinfo/haskell-cafe