Hi folks,

I'm competing in a contest at work, and we're allowed to use whatever language we want. I decided this was my chance to prove to people that Haskell was up to the challenge. Unfortunately, I ran into performance problems. Since the contest ends this Friday, I've decided to switch to C++ (gasp!). But if any of you have advice on how to speed up this code, it could help me advocate Haskell in the future.

It's supposed to match movie titles from an imported database to a reference database. The version I've sent doesn't do anything very smart - it's just doing literal title matches. The first argument to the program is the filename of the table to be imported, and the second is the filename of the reference table. The first line of each table is a pipe-separated list of field names; the rest of the lines are records, each a pipe-separated list of values.

The import files each have 3,000 records, and the reference table has 137,986 records.

Building the hash tables out of the files is quick - it just takes a few seconds. But doing the matching of title_id in one table to title_id in the other, in a nested loop between both tables, takes way too long. It's matching two import titles (against each of the reference titles) per second. It needs to do at least 20 per second to qualify for the contest, and it's not doing anything fancy yet.

I tried various "improvements" to speed it up. One was to specifically use ByteString, eliminating the AbsString class. Didn't make a difference. Another was to use arrays instead of lists to store each record, and precompute the indices of each of the fields within those records. I also iterated over a list of keys instead of the list of Maps, and only converted each record to a Map one at a time, hoping they would be disposed of sooner. Instead of speeding up the program, this slowed it down by a factor of 20!

I've profiled it, and I can't make much out of that. It seemed to be spending 25% of its time doing scoring, and I though the problem must be due to laziness, but I'm not sure.

So if anyone has any ideas how to speed this up by a factor of at least 10 times, it would be really appreciated! Even the Ruby solutions are doing that, which is embarrassing.

Thanks,
Lyle
{-# OPTIONS_GHC -fglasgow-exts #-}

-- AbsString.hs
-- An abstract string class, which makes it easier to switch string representations.

module AbsString where

import Prelude as P
import Data.ByteString.Base (c2w, w2c)
import Data.ByteString.Char8 as BSC
import Data.ByteString.Lazy.Char8 as BSLC
import Text.Regex as RE
import Test.HUnit

class (Eq s, Ord s) => AbsString s where
    s :: String -> s
    toString :: s -> String
    sLength :: s -> Int
    sAppend :: s -> s -> s
    sConcat :: [s] -> s
    putStr :: s -> IO ()
    putStrLn :: s -> IO ()
    readFile :: String -> IO s
    lines :: s -> [s]
    split :: Char -> s -> [s]

(+++) :: AbsString s => s -> s -> s
(+++) = sAppend

instance AbsString String where
    s = id
    toString = id
    sLength = P.length
    sAppend = (++)
    sConcat = P.concat
    putStr = P.putStr
    putStrLn = P.putStrLn
    readFile fn = P.readFile fn
    lines = P.lines
    split c = RE.splitRegex (mkRegex (escapeRegexChar c))

regexMetaChars = "\\|()[]^.*+?{}"

escapeRegexChar :: Char -> String
escapeRegexChar c = if c `P.elem` regexMetaChars then "\\"++[c] else [c]

instance AbsString BSC.ByteString where
    s = BSC.pack
    toString = BSC.unpack
    sLength = fromIntegral . BSC.length
    sAppend = BSC.append
    sConcat = BSC.concat
    putStr = BSC.putStr
    putStrLn = BSC.putStrLn
    readFile = BSC.readFile
    lines = BSC.lines
    split = BSC.split

instance AbsString BSLC.ByteString where
    s = BSLC.pack
    toString = BSLC.unpack
    sLength = fromIntegral . BSLC.length
    sAppend = BSLC.append
    sConcat = BSLC.concat
    putStr = BSLC.putStr
    putStrLn = BSLC.putStrLn
    readFile = BSLC.readFile
    lines = BSLC.lines
    split c s = BSLC.split c s

test_showString = TestCase $ do
  let aStr = s "Hello there" :: String
  assertEqual "" "Hello there" (toString aStr)

test_showByteString = TestCase $ do
  let aStr = s "Hello there" :: BSC.ByteString
  assertEqual "" "Hello there" (toString aStr)

test_showByteStringLazy = TestCase $ do
  let aStr = s "Hello there" :: BSLC.ByteString
  assertEqual "" "Hello there" (toString aStr)

runTests = runTestTT $
           TestList [TestLabel "test_showString" test_showString,
                     TestLabel "test_showByteString" test_showByteString,
                     TestLabel "test_showByteStringLazy" test_showByteStringLazy]
{-# OPTIONS_GHC -fglasgow-exts #-}

-- TextTable.hs
-- Defines a TextTable type, which defines a map of strings to a record
-- of text fields.

module TextTable(TextTable(..),TextRecord,makeTable,lookupRecord,listKeys,listRecords) where

import Prelude hiding (putStr,putStrLn,readFile,lines)
import qualified Data.ByteString.Lazy as BSL (ByteString)
import Data.Array.Unboxed
import Data.HashTable (HashTable)
import Data.List hiding (lines)
import Data.Map (Map,fromList)
import qualified Data.HashTable as HT
import Test.HUnit
import AbsString

data TextTable s = TextTable { tableFields :: ![s],
                               keyFieldIndex :: !Int,
                               tableRecords :: !(HashTable s (Array Int s)) }

type TextRecord s = Map s s

makeTable :: AbsString s => String -> String -> s -> IO (TextTable s)
makeTable fileName keyField text = do
  putStrLn ("Reading table from '" ++ fileName ++ "'...")
  let (headerLine:recordLines) = lines text
  let fields = split '|' headerLine
  let nFields = length fields
  let (Just keyIndex) = elemIndex (s keyField) fields
  let records = map (split '|') recordLines
  putStrLn "Fields:"
  mapM_ (\f -> putStr (if (f==s keyField) then "* " else "  ") >> putStr f >> putStr "\n") fields
  table <- HT.new (==) (HT.hashString . toString)
  putStrLn "Reading records..."
  sequence_ [do if (i `mod` 100 == 0) then putStr "." else return ()
                if (i `mod` 5000 == 0) then putStr (show i ++ "\n") else return ()
                if not (null r) then HT.insert table (r !! keyIndex) (listArray (0,nFields-1) r) else return ()
             | (i,r) <- zip [1..] records]
  putStrLn "\nDone reading table."
  return (TextTable fields keyIndex table)

lookupRecord :: AbsString s => TextTable s -> s -> IO (Maybe (TextRecord s))
lookupRecord (TextTable fields _ records) keyValue = do
  maybeRecord <- HT.lookup records keyValue
  return $ do record <- maybeRecord
              return (fromList (zip fields (elems record)))

listKeys :: AbsString s => TextTable s -> IO [s]
listKeys (TextTable fields keyField records) = do
  keyRecs <- HT.toList records
  return $ map fst keyRecs

listRecords :: AbsString s => TextTable s -> IO [TextRecord s]
listRecords (TextTable fields _ records) = do
  keyRecs <- HT.toList records
  return $ map (fromList . zip fields . elems . snd) keyRecs

test_readTable = TestCase $ do
  let text = "name|id\n"
             ++ "Warts and All|324\n"
             ++ "Rose Garden|123\n"
  table <- makeTable "<test>" "id" (s text :: BSL.ByteString)
  maybeRecord <- lookupRecord table (s "123")
  assertEqual "" (Just (fromList [(s "name", s "Rose Garden"),(s "id", s "123")])) maybeRecord
  
runTests = runTestTT $ TestList [TestLabel "test_readTable" test_readTable]
-- title_match.hs

import Prelude hiding (putStr,putStrLn,readFile,lines)
import qualified Data.ByteString as BS
import Control.Exception
import Data.Map ((!))
import Data.Maybe
import System.Environment
import System.IO.Error
import System.Time
import Test.HUnit
import AbsString
import TextTable

main = do
  args <- getArgs
  ensure (length args > 1) "Usage: title_match table_file_a table_file_b"
  let keyField = "title_id"
  let fileNameA = (args !! 0)
  fileTextA <- readFile fileNameA
  tableA <- makeTable fileNameA keyField (fileTextA :: BS.ByteString)
  let fileNameB = (args !! 1)
  fileTextB <- readFile fileNameB
  tableB <- makeTable fileNameB keyField (fileTextB :: BS.ByteString)
  matchTables tableA tableB
  return ()

data Match = Exact String String | Partial String [String]

instance Show Match where
    show (Exact sa sb) = "EXACT" ++ "|" ++ sa ++ "|" ++ sb
    show (Partial sa sbs) = "PARTIAL" ++ "|" ++ sa ++ concat ["|" ++ sb | sb <- sbs]

matchTables :: AbsString s => TextTable s -> TextTable s -> IO ()
matchTables tableA tableB = do
  aRecs <- listRecords tableA
  bRecs <- listRecords tableB
  sequence_ [do time <- getTimeStr; putStrLn ("Matching title " ++ (toString (aRec ! (s "title_id"))) ++ " at time " ++ time) ; sequence_ [if matchScore bRec aRec > 0 then putStrLn ("  Exact match between " ++ (toString (aRec ! (s "title_id"))) ++ " [" ++ (toString (aRec ! (s "muze_title_id"))) ++ "] and " ++ (toString (bRec ! (s "title_id"))) ++ " where title=\"" ++ (toString (aRec ! (s "title"))) ++ "\"") else return () | bRec <- bRecs] | aRec <- aRecs]
  
getTimeStr :: IO String
getTimeStr = do
  clockTime <- getClockTime
  calendarTime <- toCalendarTime clockTime
  return $! calendarTimeToString calendarTime

titleField :: AbsString s => s
titleField = s "title"

-- Score a match between two records - currently just an exact match
matchScore :: AbsString s => TextRecord s -> TextRecord s -> Int
matchScore r1 r2 = if r1 ! titleField == r2 ! titleField then 1 else 0

ensure x s = if x then return () else ioError (userError s)

getRecord :: AbsString s => TextTable s -> s -> IO (TextRecord s)
getRecord table keyValue = do
  maybeRecord <- lookupRecord table keyValue
  ensure (isJust maybeRecord) ("Record \"" ++ (toString keyValue) ++ "\" not found in table")
  return $ fromJust maybeRecord
_______________________________________________
Haskell-Cafe mailing list
[email protected]
http://www.haskell.org/mailman/listinfo/haskell-cafe

Reply via email to