Hi folks,
I'm competing in a contest at work, and we're allowed to use whatever
language we want. I decided this was my chance to prove to people that
Haskell was up to the challenge. Unfortunately, I ran into performance
problems. Since the contest ends this Friday, I've decided to switch to
C++ (gasp!). But if any of you have advice on how to speed up this code,
it could help me advocate Haskell in the future.
It's supposed to match movie titles from an imported database to a
reference database. The version I've sent doesn't do anything very smart
- it's just doing literal title matches. The first argument to the
program is the filename of the table to be imported, and the second is
the filename of the reference table. The first line of each table is a
pipe-separated list of field names; the rest of the lines are records,
each a pipe-separated list of values.
The import files each have 3,000 records, and the reference table has
137,986 records.
Building the hash tables out of the files is quick - it just takes a few
seconds. But doing the matching of title_id in one table to title_id in
the other, in a nested loop between both tables, takes way too long.
It's matching two import titles (against each of the reference titles)
per second. It needs to do at least 20 per second to qualify for the
contest, and it's not doing anything fancy yet.
I tried various "improvements" to speed it up. One was to specifically
use ByteString, eliminating the AbsString class. Didn't make a
difference. Another was to use arrays instead of lists to store each
record, and precompute the indices of each of the fields within those
records. I also iterated over a list of keys instead of the list of
Maps, and only converted each record to a Map one at a time, hoping they
would be disposed of sooner. Instead of speeding up the program, this
slowed it down by a factor of 20!
I've profiled it, and I can't make much out of that. It seemed to be
spending 25% of its time doing scoring, and I though the problem must be
due to laziness, but I'm not sure.
So if anyone has any ideas how to speed this up by a factor of at least
10 times, it would be really appreciated! Even the Ruby solutions are
doing that, which is embarrassing.
Thanks,
Lyle
{-# OPTIONS_GHC -fglasgow-exts #-}
-- AbsString.hs
-- An abstract string class, which makes it easier to switch string representations.
module AbsString where
import Prelude as P
import Data.ByteString.Base (c2w, w2c)
import Data.ByteString.Char8 as BSC
import Data.ByteString.Lazy.Char8 as BSLC
import Text.Regex as RE
import Test.HUnit
class (Eq s, Ord s) => AbsString s where
s :: String -> s
toString :: s -> String
sLength :: s -> Int
sAppend :: s -> s -> s
sConcat :: [s] -> s
putStr :: s -> IO ()
putStrLn :: s -> IO ()
readFile :: String -> IO s
lines :: s -> [s]
split :: Char -> s -> [s]
(+++) :: AbsString s => s -> s -> s
(+++) = sAppend
instance AbsString String where
s = id
toString = id
sLength = P.length
sAppend = (++)
sConcat = P.concat
putStr = P.putStr
putStrLn = P.putStrLn
readFile fn = P.readFile fn
lines = P.lines
split c = RE.splitRegex (mkRegex (escapeRegexChar c))
regexMetaChars = "\\|()[]^.*+?{}"
escapeRegexChar :: Char -> String
escapeRegexChar c = if c `P.elem` regexMetaChars then "\\"++[c] else [c]
instance AbsString BSC.ByteString where
s = BSC.pack
toString = BSC.unpack
sLength = fromIntegral . BSC.length
sAppend = BSC.append
sConcat = BSC.concat
putStr = BSC.putStr
putStrLn = BSC.putStrLn
readFile = BSC.readFile
lines = BSC.lines
split = BSC.split
instance AbsString BSLC.ByteString where
s = BSLC.pack
toString = BSLC.unpack
sLength = fromIntegral . BSLC.length
sAppend = BSLC.append
sConcat = BSLC.concat
putStr = BSLC.putStr
putStrLn = BSLC.putStrLn
readFile = BSLC.readFile
lines = BSLC.lines
split c s = BSLC.split c s
test_showString = TestCase $ do
let aStr = s "Hello there" :: String
assertEqual "" "Hello there" (toString aStr)
test_showByteString = TestCase $ do
let aStr = s "Hello there" :: BSC.ByteString
assertEqual "" "Hello there" (toString aStr)
test_showByteStringLazy = TestCase $ do
let aStr = s "Hello there" :: BSLC.ByteString
assertEqual "" "Hello there" (toString aStr)
runTests = runTestTT $
TestList [TestLabel "test_showString" test_showString,
TestLabel "test_showByteString" test_showByteString,
TestLabel "test_showByteStringLazy" test_showByteStringLazy]
{-# OPTIONS_GHC -fglasgow-exts #-}
-- TextTable.hs
-- Defines a TextTable type, which defines a map of strings to a record
-- of text fields.
module TextTable(TextTable(..),TextRecord,makeTable,lookupRecord,listKeys,listRecords) where
import Prelude hiding (putStr,putStrLn,readFile,lines)
import qualified Data.ByteString.Lazy as BSL (ByteString)
import Data.Array.Unboxed
import Data.HashTable (HashTable)
import Data.List hiding (lines)
import Data.Map (Map,fromList)
import qualified Data.HashTable as HT
import Test.HUnit
import AbsString
data TextTable s = TextTable { tableFields :: ![s],
keyFieldIndex :: !Int,
tableRecords :: !(HashTable s (Array Int s)) }
type TextRecord s = Map s s
makeTable :: AbsString s => String -> String -> s -> IO (TextTable s)
makeTable fileName keyField text = do
putStrLn ("Reading table from '" ++ fileName ++ "'...")
let (headerLine:recordLines) = lines text
let fields = split '|' headerLine
let nFields = length fields
let (Just keyIndex) = elemIndex (s keyField) fields
let records = map (split '|') recordLines
putStrLn "Fields:"
mapM_ (\f -> putStr (if (f==s keyField) then "* " else " ") >> putStr f >> putStr "\n") fields
table <- HT.new (==) (HT.hashString . toString)
putStrLn "Reading records..."
sequence_ [do if (i `mod` 100 == 0) then putStr "." else return ()
if (i `mod` 5000 == 0) then putStr (show i ++ "\n") else return ()
if not (null r) then HT.insert table (r !! keyIndex) (listArray (0,nFields-1) r) else return ()
| (i,r) <- zip [1..] records]
putStrLn "\nDone reading table."
return (TextTable fields keyIndex table)
lookupRecord :: AbsString s => TextTable s -> s -> IO (Maybe (TextRecord s))
lookupRecord (TextTable fields _ records) keyValue = do
maybeRecord <- HT.lookup records keyValue
return $ do record <- maybeRecord
return (fromList (zip fields (elems record)))
listKeys :: AbsString s => TextTable s -> IO [s]
listKeys (TextTable fields keyField records) = do
keyRecs <- HT.toList records
return $ map fst keyRecs
listRecords :: AbsString s => TextTable s -> IO [TextRecord s]
listRecords (TextTable fields _ records) = do
keyRecs <- HT.toList records
return $ map (fromList . zip fields . elems . snd) keyRecs
test_readTable = TestCase $ do
let text = "name|id\n"
++ "Warts and All|324\n"
++ "Rose Garden|123\n"
table <- makeTable "<test>" "id" (s text :: BSL.ByteString)
maybeRecord <- lookupRecord table (s "123")
assertEqual "" (Just (fromList [(s "name", s "Rose Garden"),(s "id", s "123")])) maybeRecord
runTests = runTestTT $ TestList [TestLabel "test_readTable" test_readTable]
-- title_match.hs
import Prelude hiding (putStr,putStrLn,readFile,lines)
import qualified Data.ByteString as BS
import Control.Exception
import Data.Map ((!))
import Data.Maybe
import System.Environment
import System.IO.Error
import System.Time
import Test.HUnit
import AbsString
import TextTable
main = do
args <- getArgs
ensure (length args > 1) "Usage: title_match table_file_a table_file_b"
let keyField = "title_id"
let fileNameA = (args !! 0)
fileTextA <- readFile fileNameA
tableA <- makeTable fileNameA keyField (fileTextA :: BS.ByteString)
let fileNameB = (args !! 1)
fileTextB <- readFile fileNameB
tableB <- makeTable fileNameB keyField (fileTextB :: BS.ByteString)
matchTables tableA tableB
return ()
data Match = Exact String String | Partial String [String]
instance Show Match where
show (Exact sa sb) = "EXACT" ++ "|" ++ sa ++ "|" ++ sb
show (Partial sa sbs) = "PARTIAL" ++ "|" ++ sa ++ concat ["|" ++ sb | sb <- sbs]
matchTables :: AbsString s => TextTable s -> TextTable s -> IO ()
matchTables tableA tableB = do
aRecs <- listRecords tableA
bRecs <- listRecords tableB
sequence_ [do time <- getTimeStr; putStrLn ("Matching title " ++ (toString (aRec ! (s "title_id"))) ++ " at time " ++ time) ; sequence_ [if matchScore bRec aRec > 0 then putStrLn (" Exact match between " ++ (toString (aRec ! (s "title_id"))) ++ " [" ++ (toString (aRec ! (s "muze_title_id"))) ++ "] and " ++ (toString (bRec ! (s "title_id"))) ++ " where title=\"" ++ (toString (aRec ! (s "title"))) ++ "\"") else return () | bRec <- bRecs] | aRec <- aRecs]
getTimeStr :: IO String
getTimeStr = do
clockTime <- getClockTime
calendarTime <- toCalendarTime clockTime
return $! calendarTimeToString calendarTime
titleField :: AbsString s => s
titleField = s "title"
-- Score a match between two records - currently just an exact match
matchScore :: AbsString s => TextRecord s -> TextRecord s -> Int
matchScore r1 r2 = if r1 ! titleField == r2 ! titleField then 1 else 0
ensure x s = if x then return () else ioError (userError s)
getRecord :: AbsString s => TextTable s -> s -> IO (TextRecord s)
getRecord table keyValue = do
maybeRecord <- lookupRecord table keyValue
ensure (isJust maybeRecord) ("Record \"" ++ (toString keyValue) ++ "\" not found in table")
return $ fromJust maybeRecord
_______________________________________________
Haskell-Cafe mailing list
[email protected]
http://www.haskell.org/mailman/listinfo/haskell-cafe