Yitzchak Gale wrote:
> The API for parsing and rendering time in Data.Time is
> based on the standard API for that in C - like the libraries
> in most languages. It's pretty standard stuff.
> 
> I'm sure it can be improved upon though. If you have a useful
> alternative time parsing library, please release it on Hackage.

In the perl world, there is a Date::Parse library that I have
found very useful in many situations. It can parse any date format
you throw at it, fairly reliably[1].

The closest I have in Haskell is the attached module, which I used to parse
all dates in all Usenet posts between 1981 and 1991. While small portions
are specific to Usenet messages it does have quite a lot of generic
parsing code, and perhaps most usefully, a large number of date format
strings, and I have thought about making a library if I find time.
(Or anyone is welcome to take it and do so.)

It's based on Data.Time.Parse instead of Data.Time.Format.parseTime,
probably because I didn't notice the latter when I was writing it.

-- 
see shy jo

[1] m/d/y vs d/m/y being the main weakness of course
{- Date parsing. The really horribly ugly bit, as there were few standards
 - on the early usenet. -}

module MsgTime where

import Data.Time.Parse
import Data.String.Utils
import Data.Maybe
import Data.Time.LocalTime
import Data.Time.Clock.POSIX
import Data.Char

import Msg

type TimeZoneParser = String -> Maybe (TimeZone, String)
type DateParser = String -> Maybe (LocalTime, String)

data TimeParser = TimeParser {
	dateparser :: DateParser,
	timezoneparser :: TimeZoneParser,
	example :: String
}

instance Show TimeParser where
	show (TimeParser _ _ e) = "TimeParser " ++ e

extractTime :: Msg -> Either String POSIXTime
extractTime m
	| null date = case parseTime mid of
		Left _ -> Left "missing Date header"
		Right t -> Right t
	| otherwise = case parseTime date of
		Left e -> Left e
		Right t -> 
			if t >= earliestSane
				then Right t
				else parseTime dateR -- try this instead
	where
		date = getHeader m (Header "Date")
		dateR = getHeader m (Header "Date-Received")
		mid = getHeader m (Header "Message-Id")

parseTime :: String -> Either String POSIXTime
parseTime s
	| null matches = Left $ "cannot parse: " ++ s
	| otherwise = Right $ matches !! 0
	where
		matches = catMaybes $ map (applyParser s) parsers

{- Applies a date parser and a timezone parser to the string, only
 - succeeding if the entire string is consumed. -}
applyParser :: String -> TimeParser -> Maybe POSIXTime
applyParser s (TimeParser dp zp _) =
	case dp (strip s) of
		Nothing -> Nothing
		Just (t, s') ->
			case zp (map toLower $ strip s') of
				Just (z, []) -> Just $ toPOSIXTime $ ZonedTime t z
				_ -> Nothing

earliestSane :: POSIXTime
earliestSane = toPOSIXTime $
	ZonedTime (fst $ fromJust $ strptime fmt cutoff) Data.Time.LocalTime.utc
	where
		cutoff = "1980-01-01 01:01:01"
		fmt = "%Y-%m-%d %T"

{- All the date formats you can shake a stick at.. and then some! -}
parsers :: [TimeParser]
parsers = 
	[ p anyzone "%d %b %y %T"		"15 Jun 88 02:27:41 GMT"
	, p anyzone "%a, %d %b %y %T"		"Thu, 22 Jun 89 20:02:03 GMT"
	, p anyzone "%a, %d-%b-%y %T"		"Thu, 15-Jun-89 18:01:56 EDT"
	, p anyzone "%d %b %y %T"		"8 Jan 90 14:07:27 -0400"
	, p anyzone "%d %b %y %H:%M"		"4 Oct 89 19:56 GMT"
	, p anyzone "%a, %d %b %y %H:%M"	"Thu, 23 May 91 02:13 PDT"
	, p anyzone "%a, %d %b %Y %T"		"Thu, 23 May 1991 07:07:00 -0400"
	, p anyzone "%a, %d %b %Y %H:%M"	"Sat, 18 May 1991 17:28 CDT"
	, p anyzone "%d %b %Y %T"		"11 Apr 1991 12:02:01 GMT"
	, p anyzone "%d-%b-%y %H:%M"		"24-Mar-90 14:22 CST"
	, p anyzone "%d %b %y, %T"		"22 May 91, 16:31:37 EST"
	, p anyzone "%a, %d %b T  %T"		"Fri, 8 Feb T  09:49:39 EST"
	, p anyzone "%d %b %Y %H:%M"		"30 June 1991 17:15 -0400"

	-- special cases
	, p (tzconst est) "%a %b %d %T EST %Y"	"Tue Jan 11 12:44:36 EST 1983"
	, p (tzconst est) "%a %b %d %T EST %y"	"Tue Jan 11 12:44:36 EST 83"
	, p (tzconst edt) "%a %b %d %T EDT %Y"	"Tue Jan 11 12:44:36 EDT 1983"
	, p (tzconst edt) "%a %b %d %T EDT %y"	"Tue Jan 11 12:44:36 EDT 83"
	, p (tzconst utc) "%a %b %d %T GMT %Y"	"Thu Nov  1 23:14:37 GMT 1990"
	, p (tzconst pdt) "%d %b %y %T -7"	"11 Jun 91 15:41:21 -7"

	-- dates with no timezone specified are guessed
	, p nozone "%d %b %y %T"		"9 Jan 90 09:33:59"
	, p nozone "%d %b %Y %T"		"10 APR 1990 05:25:28"
	, p nozone "%a %b %d %T %Y"		"Fri Feb  6 00:19:47 1981"
	, p nozone "%a %b %d %T %y"		"Fri Feb  6 00:19:47 81"
	, p nozone "%Y-%m-%d %T"		"1981-11-12 18:31:01"
	, p nozone "%y-%m-%d %T"		"81-11-12 18:31:01"
	, p nozone "%a, %d %b %y %T"		"Sat, 13 Apr 91 08:37:57"
	, p nozone "%a, %d %b %Y %T"		"Sun, 16 Jun 1991 13:23:02"
	, p nozone "%d %b, %Y %T"		"1 May, 1991 00:00:00"
	, p nozone "%d %b %y %H:%M"		"8 Jan 88 18:03"
	, p nozone "%a, %d %b %y %H:%M"		"Wed, 29 May 91 17:14"
	, p nozone "1 %b %d %T %Y"		"1 Jan 08 20:59:08 1991"

	-- this has to come near the end, as it matches greedily
	, g nozone "%a %b %d %T %Y ("		"Wed Oct 27 17:02:46 1982 (Tuesday)"
	, g nozone "%a, %d %b %y %T +"		"Tue, 21 May 91 16:46:01 +22323328"

	-- extract date from message-id headers
	-- (used for messages with no Date field)
	, g nozone "<%Y%b%d.%H%M%S."		"<[email protected]>"
	]
	where
		p z f e = TimeParser (strptime f) z e
		g z f e = TimeParser (greedystrptime f) z e

{- ignores trailing garbage -}
greedystrptime :: String -> DateParser
greedystrptime f s = 
	case strptime f s of
		Nothing -> Nothing
		Just (t, _) -> Just (t, "")

anyzone :: TimeZoneParser
anyzone "utc" = retzone utc
anyzone "ut"  = retzone utc
anyzone "gmt" = retzone utc
anyzone "edt" = retzone edt
anyzone "est" = retzone est
anyzone "cdt" = retzone cdt
anyzone "cst" = retzone cst
anyzone "mdt" = retzone mdt
anyzone "mst" = retzone mst
anyzone "pdt" = retzone pdt
anyzone "pst" = retzone pst
anyzone "hst" = retzone hst
anyzone "ast" = retzone ast -- ambiguous time zone, but alaska predominates
anyzone "nzt" = retzone $ hoursToTimeZone 12
anyzone "nzst" = anyzone "nzt"
anyzone "nzdt" = retzone $ hoursToTimeZone 13
anyzone "nzd" = anyzone "nzdt"
anyzone "nzs" = anyzone "nzt"
-- ambiguous but israel predominates
anyzone "ist" = retzone $ hoursToTimeZone 2
-- could also be Burma time, but checked some messages
-- and it was used by British
anyzone "bst" = retzone $ hoursToTimeZone 1
anyzone "cet" = retzone $ hoursToTimeZone 1
anyzone "wet" = retzone utc
anyzone "met" = anyzone "cet"
anyzone "cest" = retzone $ hoursToTimeZone 2
anyzone "mest" = anyzone "cet"
anyzone "mes" = anyzone "cet"
anyzone "mez" = anyzone "cet"
anyzone "tur" = retzone $ hoursToTimeZone 2 -- turkey
anyzone "lcl" = retzone est -- unknown
anyzone "dst" = retzone est -- unknown
anyzone "n" = retzone est -- unknown
anyzone "swe" = anyzone "cet" -- something swedish
anyzone "plt" = anyzone "pst" -- probably a typo
anyzone "u" = anyzone "utc" -- probably a typo
anyzone "brt" = retzone $ hoursToTimeZone (-3)
anyzone (sign:h1:h2:h3:h4:rest) = zoneOffset sign [h1,h2,h3,h4] rest
anyzone (sign:h1:h2:h3:rest) = zoneOffset sign [h1,h2,h3] rest
anyzone (sign:h1:h2:rest) = zoneOffset sign [h1,h2] rest
anyzone (sign:h1:rest) = zoneOffset sign [h1] rest
anyzone _ = Nothing

zoneOffset :: Char -> String -> TimeZoneParser
zoneOffset sign num rest = 
	case (sign, all isDigit num) of
		('-', True) -> res (-1)
		('+', True) -> res 1
		_ -> Nothing
	where
		-- XXX non-hour timezone offsets ignored
		res mult = Just (hoursToTimeZone $ mult * read num, rest)
		

{- Adds a US/Eastern timezone offset to a time.
 - 
 - When this is used, no actual time zone is known; we're just guessing.
 - Much Usenet traffic was on the East coast of the US; while some was
 - in California. In this case, it's better to guess, even if the
 - guess is wrong by 3 hours, than to leave a bad default assumption of UTC.
 -}
nozone :: TimeZoneParser
nozone = tzconst est

est :: TimeZone
est = hoursToTimeZone (-5)

edt :: TimeZone
edt = hoursToTimeZone (-4)
cdt :: TimeZone
cdt = hoursToTimeZone (-5)
cst :: TimeZone
cst = hoursToTimeZone (-6)
mdt :: TimeZone
mdt = hoursToTimeZone (-6)
mst :: TimeZone
mst = hoursToTimeZone (-7)
pdt :: TimeZone
pdt = hoursToTimeZone (-7)
pst :: TimeZone
pst = hoursToTimeZone (-8)
ast :: TimeZone
ast = hoursToTimeZone (-9)
hst :: TimeZone
hst = hoursToTimeZone (-10)

{- A timezone parser that sets a constant time zone, requiring the date
 - parse to have matched the whole string. -}
tzconst :: TimeZone -> TimeZoneParser
tzconst z "" = retzone z
tzconst _ _ = Nothing

retzone :: TimeZone -> Maybe (TimeZone, String)
retzone z = Just (z, "")

toPOSIXTime :: ZonedTime -> POSIXTime
toPOSIXTime = utcTimeToPOSIXSeconds . zonedTimeToUTC

{- simple test that each parser parses its example date
 - (does not check the result) -}
testParsers :: [Bool]
testParsers = map test parsers
	where
		test p = case applyParser (example p) p of
			Just _ -> True
			Nothing -> error $ "parser failed to parse its example: " ++ show p

Attachment: signature.asc
Description: Digital signature

_______________________________________________
Haskell-Cafe mailing list
[email protected]
http://www.haskell.org/mailman/listinfo/haskell-cafe

Reply via email to