Yitzchak Gale wrote: > The API for parsing and rendering time in Data.Time is > based on the standard API for that in C - like the libraries > in most languages. It's pretty standard stuff. > > I'm sure it can be improved upon though. If you have a useful > alternative time parsing library, please release it on Hackage.
In the perl world, there is a Date::Parse library that I have found very useful in many situations. It can parse any date format you throw at it, fairly reliably[1]. The closest I have in Haskell is the attached module, which I used to parse all dates in all Usenet posts between 1981 and 1991. While small portions are specific to Usenet messages it does have quite a lot of generic parsing code, and perhaps most usefully, a large number of date format strings, and I have thought about making a library if I find time. (Or anyone is welcome to take it and do so.) It's based on Data.Time.Parse instead of Data.Time.Format.parseTime, probably because I didn't notice the latter when I was writing it. -- see shy jo [1] m/d/y vs d/m/y being the main weakness of course
{- Date parsing. The really horribly ugly bit, as there were few standards
- on the early usenet. -}
module MsgTime where
import Data.Time.Parse
import Data.String.Utils
import Data.Maybe
import Data.Time.LocalTime
import Data.Time.Clock.POSIX
import Data.Char
import Msg
type TimeZoneParser = String -> Maybe (TimeZone, String)
type DateParser = String -> Maybe (LocalTime, String)
data TimeParser = TimeParser {
dateparser :: DateParser,
timezoneparser :: TimeZoneParser,
example :: String
}
instance Show TimeParser where
show (TimeParser _ _ e) = "TimeParser " ++ e
extractTime :: Msg -> Either String POSIXTime
extractTime m
| null date = case parseTime mid of
Left _ -> Left "missing Date header"
Right t -> Right t
| otherwise = case parseTime date of
Left e -> Left e
Right t ->
if t >= earliestSane
then Right t
else parseTime dateR -- try this instead
where
date = getHeader m (Header "Date")
dateR = getHeader m (Header "Date-Received")
mid = getHeader m (Header "Message-Id")
parseTime :: String -> Either String POSIXTime
parseTime s
| null matches = Left $ "cannot parse: " ++ s
| otherwise = Right $ matches !! 0
where
matches = catMaybes $ map (applyParser s) parsers
{- Applies a date parser and a timezone parser to the string, only
- succeeding if the entire string is consumed. -}
applyParser :: String -> TimeParser -> Maybe POSIXTime
applyParser s (TimeParser dp zp _) =
case dp (strip s) of
Nothing -> Nothing
Just (t, s') ->
case zp (map toLower $ strip s') of
Just (z, []) -> Just $ toPOSIXTime $ ZonedTime t z
_ -> Nothing
earliestSane :: POSIXTime
earliestSane = toPOSIXTime $
ZonedTime (fst $ fromJust $ strptime fmt cutoff) Data.Time.LocalTime.utc
where
cutoff = "1980-01-01 01:01:01"
fmt = "%Y-%m-%d %T"
{- All the date formats you can shake a stick at.. and then some! -}
parsers :: [TimeParser]
parsers =
[ p anyzone "%d %b %y %T" "15 Jun 88 02:27:41 GMT"
, p anyzone "%a, %d %b %y %T" "Thu, 22 Jun 89 20:02:03 GMT"
, p anyzone "%a, %d-%b-%y %T" "Thu, 15-Jun-89 18:01:56 EDT"
, p anyzone "%d %b %y %T" "8 Jan 90 14:07:27 -0400"
, p anyzone "%d %b %y %H:%M" "4 Oct 89 19:56 GMT"
, p anyzone "%a, %d %b %y %H:%M" "Thu, 23 May 91 02:13 PDT"
, p anyzone "%a, %d %b %Y %T" "Thu, 23 May 1991 07:07:00 -0400"
, p anyzone "%a, %d %b %Y %H:%M" "Sat, 18 May 1991 17:28 CDT"
, p anyzone "%d %b %Y %T" "11 Apr 1991 12:02:01 GMT"
, p anyzone "%d-%b-%y %H:%M" "24-Mar-90 14:22 CST"
, p anyzone "%d %b %y, %T" "22 May 91, 16:31:37 EST"
, p anyzone "%a, %d %b T %T" "Fri, 8 Feb T 09:49:39 EST"
, p anyzone "%d %b %Y %H:%M" "30 June 1991 17:15 -0400"
-- special cases
, p (tzconst est) "%a %b %d %T EST %Y" "Tue Jan 11 12:44:36 EST 1983"
, p (tzconst est) "%a %b %d %T EST %y" "Tue Jan 11 12:44:36 EST 83"
, p (tzconst edt) "%a %b %d %T EDT %Y" "Tue Jan 11 12:44:36 EDT 1983"
, p (tzconst edt) "%a %b %d %T EDT %y" "Tue Jan 11 12:44:36 EDT 83"
, p (tzconst utc) "%a %b %d %T GMT %Y" "Thu Nov 1 23:14:37 GMT 1990"
, p (tzconst pdt) "%d %b %y %T -7" "11 Jun 91 15:41:21 -7"
-- dates with no timezone specified are guessed
, p nozone "%d %b %y %T" "9 Jan 90 09:33:59"
, p nozone "%d %b %Y %T" "10 APR 1990 05:25:28"
, p nozone "%a %b %d %T %Y" "Fri Feb 6 00:19:47 1981"
, p nozone "%a %b %d %T %y" "Fri Feb 6 00:19:47 81"
, p nozone "%Y-%m-%d %T" "1981-11-12 18:31:01"
, p nozone "%y-%m-%d %T" "81-11-12 18:31:01"
, p nozone "%a, %d %b %y %T" "Sat, 13 Apr 91 08:37:57"
, p nozone "%a, %d %b %Y %T" "Sun, 16 Jun 1991 13:23:02"
, p nozone "%d %b, %Y %T" "1 May, 1991 00:00:00"
, p nozone "%d %b %y %H:%M" "8 Jan 88 18:03"
, p nozone "%a, %d %b %y %H:%M" "Wed, 29 May 91 17:14"
, p nozone "1 %b %d %T %Y" "1 Jan 08 20:59:08 1991"
-- this has to come near the end, as it matches greedily
, g nozone "%a %b %d %T %Y (" "Wed Oct 27 17:02:46 1982 (Tuesday)"
, g nozone "%a, %d %b %y %T +" "Tue, 21 May 91 16:46:01 +22323328"
-- extract date from message-id headers
-- (used for messages with no Date field)
, g nozone "<%Y%b%d.%H%M%S." "<[email protected]>"
]
where
p z f e = TimeParser (strptime f) z e
g z f e = TimeParser (greedystrptime f) z e
{- ignores trailing garbage -}
greedystrptime :: String -> DateParser
greedystrptime f s =
case strptime f s of
Nothing -> Nothing
Just (t, _) -> Just (t, "")
anyzone :: TimeZoneParser
anyzone "utc" = retzone utc
anyzone "ut" = retzone utc
anyzone "gmt" = retzone utc
anyzone "edt" = retzone edt
anyzone "est" = retzone est
anyzone "cdt" = retzone cdt
anyzone "cst" = retzone cst
anyzone "mdt" = retzone mdt
anyzone "mst" = retzone mst
anyzone "pdt" = retzone pdt
anyzone "pst" = retzone pst
anyzone "hst" = retzone hst
anyzone "ast" = retzone ast -- ambiguous time zone, but alaska predominates
anyzone "nzt" = retzone $ hoursToTimeZone 12
anyzone "nzst" = anyzone "nzt"
anyzone "nzdt" = retzone $ hoursToTimeZone 13
anyzone "nzd" = anyzone "nzdt"
anyzone "nzs" = anyzone "nzt"
-- ambiguous but israel predominates
anyzone "ist" = retzone $ hoursToTimeZone 2
-- could also be Burma time, but checked some messages
-- and it was used by British
anyzone "bst" = retzone $ hoursToTimeZone 1
anyzone "cet" = retzone $ hoursToTimeZone 1
anyzone "wet" = retzone utc
anyzone "met" = anyzone "cet"
anyzone "cest" = retzone $ hoursToTimeZone 2
anyzone "mest" = anyzone "cet"
anyzone "mes" = anyzone "cet"
anyzone "mez" = anyzone "cet"
anyzone "tur" = retzone $ hoursToTimeZone 2 -- turkey
anyzone "lcl" = retzone est -- unknown
anyzone "dst" = retzone est -- unknown
anyzone "n" = retzone est -- unknown
anyzone "swe" = anyzone "cet" -- something swedish
anyzone "plt" = anyzone "pst" -- probably a typo
anyzone "u" = anyzone "utc" -- probably a typo
anyzone "brt" = retzone $ hoursToTimeZone (-3)
anyzone (sign:h1:h2:h3:h4:rest) = zoneOffset sign [h1,h2,h3,h4] rest
anyzone (sign:h1:h2:h3:rest) = zoneOffset sign [h1,h2,h3] rest
anyzone (sign:h1:h2:rest) = zoneOffset sign [h1,h2] rest
anyzone (sign:h1:rest) = zoneOffset sign [h1] rest
anyzone _ = Nothing
zoneOffset :: Char -> String -> TimeZoneParser
zoneOffset sign num rest =
case (sign, all isDigit num) of
('-', True) -> res (-1)
('+', True) -> res 1
_ -> Nothing
where
-- XXX non-hour timezone offsets ignored
res mult = Just (hoursToTimeZone $ mult * read num, rest)
{- Adds a US/Eastern timezone offset to a time.
-
- When this is used, no actual time zone is known; we're just guessing.
- Much Usenet traffic was on the East coast of the US; while some was
- in California. In this case, it's better to guess, even if the
- guess is wrong by 3 hours, than to leave a bad default assumption of UTC.
-}
nozone :: TimeZoneParser
nozone = tzconst est
est :: TimeZone
est = hoursToTimeZone (-5)
edt :: TimeZone
edt = hoursToTimeZone (-4)
cdt :: TimeZone
cdt = hoursToTimeZone (-5)
cst :: TimeZone
cst = hoursToTimeZone (-6)
mdt :: TimeZone
mdt = hoursToTimeZone (-6)
mst :: TimeZone
mst = hoursToTimeZone (-7)
pdt :: TimeZone
pdt = hoursToTimeZone (-7)
pst :: TimeZone
pst = hoursToTimeZone (-8)
ast :: TimeZone
ast = hoursToTimeZone (-9)
hst :: TimeZone
hst = hoursToTimeZone (-10)
{- A timezone parser that sets a constant time zone, requiring the date
- parse to have matched the whole string. -}
tzconst :: TimeZone -> TimeZoneParser
tzconst z "" = retzone z
tzconst _ _ = Nothing
retzone :: TimeZone -> Maybe (TimeZone, String)
retzone z = Just (z, "")
toPOSIXTime :: ZonedTime -> POSIXTime
toPOSIXTime = utcTimeToPOSIXSeconds . zonedTimeToUTC
{- simple test that each parser parses its example date
- (does not check the result) -}
testParsers :: [Bool]
testParsers = map test parsers
where
test p = case applyParser (example p) p of
Just _ -> True
Nothing -> error $ "parser failed to parse its example: " ++ show p
signature.asc
Description: Digital signature
_______________________________________________ Haskell-Cafe mailing list [email protected] http://www.haskell.org/mailman/listinfo/haskell-cafe
