[Haskell-cafe] user error when using Text.Regex.PCRE

Olivier Boudry Mon, 19 Nov 2007 14:52:42 -0800

Hi all,

I'm getting a strange user error when working with Data.ByteString.Char8 and
Text.Regex.PCRE.


Error I get is:
CustomerMaster: user error (Text.Regex.PCRE.ByteString died: (ReturnCode
0,"Ptr
parameter was nullPtr in Text.Regex.PCRE.Wrap.wrapMatch cstr"))

The part of the code causing problems is at the end of this e-mail. The
complete code is attached.

I'm working at cleansing customer addresses for a ERP system data migration.
The failing function is supposed to split some bytestrings using a list of
regular expressions (split on words like suite, building, attention, etc...)
stored in the "tags" binding. I use a foldl' to try each regular expression
on each address (addr is a list of address part, can be seen as a list of
bytestring values).

The funny and weird thing is that when I have only one regular expression in
tags, it works. When I have more than one regular expression in tags it
fails (user error) the first time one of the regular expression matches. A
match replaces the current address part with 3 new parts, before, match,
after. The next regular expression in the tags list will test the new before
and after parts. I presume it's the testing of those newly created parts
that causes the problem.

Any idea about what could go wrong here?

Thanks,

Olivier.

=== code ===

-- | Split unparsed address parts on keywords (suite, building, doors,
etc...)

splitOnTags :: AddressState
splitOnTags = do
  addr <- get
  put $ foldl' f addr tags
  where
    f a t   = concatMap (split t) a
    split (T {partType = p, regex = r}) a@(AP X v)
      | BS.null y    = [a]
      | otherwise = [AP X x, AE p, AP X z] -- new address parts
      where
        (x, y, z) = match r v
    split _ a = [a]

-- | Regex and part type representing tags used for splitting addresses

tags :: [Tag]
tags = [T AT
          (makeRegexOpts compCaseless execBlank
            "\\b(?:ATTN|ATTENTION|C/O)\\b[[:punct:]]?")
          (BS.pack "ATTN: ")
       ,T PB
          (makeRegexOpts compCaseless execBlank
            "\\b(?:(?:P\\.?\\s?O\\.?\\s*)BOX|C\\.P\\.|POSTFACH)\
            \[[:punct:]]?\\s+(?:NO.\\s+)?")
          (BS.pack "PO BOX: ")
       ,T BLDG
          (makeRegexOpts compCaseless execBlank
            "\\b(?:BLDG|BUILDING|HANGAR)[[:punct:]]?")
          (BS.pack "BLDG: ")
       ,T DOCK
          (makeRegexOpts compCaseless execBlank
            "\\bDOCK[[:punct:]]?")
          (BS.pack "DOCK: ")
       ,T STE
          (makeRegexOpts compCaseless execBlank
            "\\b(?:SUITE|STE|APT|ROOM)[[:punct:]]?\
            \\\s+(?:NO[[:punct:]]?)?")
          (BS.pack "STE: ")
       ,T UNIT
          (makeRegexOpts compCaseless execBlank
            "\\bUNIT[[:punct:]]?")
          (BS.pack "UNIT: ")
       ,T FLOOR
          (makeRegexOpts compCaseless execBlank
            "FLOOR")
          (BS.pack "FLOOR: ")
       ,T DOORS
          (makeRegexOpts compCaseless execBlank
            "\\bDOORS?[[:punct:]]?")
          (BS.pack "DOORS: ")]

module Main where

import Control.Monad.State
import qualified Data.ByteString.Char8 as BS
import Data.Char (isSpace)
import Data.List
import System.Environment
import System.IO
import Text.Regex.PCRE

--------------------------------------------------------------------------------
-- Data declarations
--------------------------------------------------------------------------------

-- | An address

type Address = [AddressPart]

-- | An address part (Suite, Dock, City, etc...) or a line separator

data AddressPart = AP AddressPartType BS.ByteString
                 | AE AddressPartType -- tag found but not linked to text
                 | S  AddressPartType -- line separator
                 deriving (Show, Eq)

-- | Tags for address parts

data AddressPartType = I      -- Id
                     | N      -- Name
                     | PF     -- Partner function
                     | AT     -- Attention
                     | N2     -- Name 2
                     | PB     -- P.O. Box
                     | BLDG   -- Building
                     | DOCK   -- Dock
                     | STE    -- Suite
                     | UNIT   -- Unit
                     | FLOOR  -- Floor
                     | DOORS  -- Doors
                     | ST     -- Street
                     | CI     -- City
                     | CO     -- Country
                     | R      -- Region
                     | Z      -- Zip code
                     | X      -- Unparsed
                     | E      -- Error message
                     | L1     -- Start of Line 1
                     | L2     -- Start of Line 2
                     | L3     -- Start of Line 3
                     | L4     -- Start of Line 4 (zip code)
                     deriving (Show, Eq)

-- | The address state the program will run in

type AddressState = State Address ()

data Tag = T {partType :: AddressPartType,
              regex    :: Regex,
              partName :: BS.ByteString}

--------------------------------------------------------------------------------
-- Formating and extraction of address parts
--------------------------------------------------------------------------------

-- | format address part for printing

format :: AddressPart -> BS.ByteString
format (S _)        = BS.empty
format (AE p)       = format (AP p tagEMPTY)
format (AP PB    v) = BS.append tagPOBOX v
format (AP BLDG  v) = BS.append tagBLDG  v
format (AP DOCK  v) = BS.append tagDOCK  v
format (AP STE   v) = BS.append tagSTE   v
format (AP UNIT  v) = BS.append tagUNIT  v
format (AP FLOOR v) = BS.append tagFLOOR v
format (AP DOORS v) = BS.append tagDOORS v
format (AP X     v) = BS.append tagX     v
format (AP _     v) = v

-- | ByteStringS used for reporting

tagPOBOX, tagBLDG, tagDOCK, tagSTE, tagUNIT, tagFLOOR, tagDOORS, tagX, tagEMPTY
  :: BS.ByteString
tagPOBOX = BS.pack "PO BOX: "
tagBLDG  = BS.pack "BLDG: "
tagDOCK  = BS.pack "DOCK: "
tagSTE   = BS.pack "STE: "
tagUNIT  = BS.pack "UNIT: "
tagFLOOR = BS.pack "FLOOR: "
tagDOORS = BS.pack "DOORS: "
tagX     = BS.pack "XXX: "
tagEMPTY = BS.pack "!!!EMPTY!!!"

-- Get address part value

value :: AddressPart -> BS.ByteString
value (AP _ v) = v
value (S _)    = error "No value: AddressPart type S is a separator"
value (AE t)   = BS.append (BS.pack "AE: ") (BS.pack $ show t)
--value (AE _)   = error "No value: AddressPart type AE contains no value" -- TODO reactivate

-- Apply a ByteString conversion function to an AddressPart

apply :: (BS.ByteString -> BS.ByteString) -> AddressPart -> AddressPart
apply f (AP t v) = AP t (f v)
apply _ p        = p

--------------------------------------------------------------------------------
-- IO functions
--------------------------------------------------------------------------------

-- | The App entry point

main :: IO ()
main = do
  args <- getArgs
  case args of
    (file_name:[]) -> processFile file_name
    _              -> usage

-- | The IO stuff

processFile :: String -> IO ()
processFile file_name = do
  content <- BS.readFile file_name
  let result = parseAddresses content
  BS.putStrLn result

-- | How to use this application

usage :: IO ()
usage = do
  progName <- getProgName
  hPutStrLn stderr $ "Usage:\n\t" ++ progName ++ " file_to_parse.txt"

--------------------------------------------------------------------------------
-- Parsing functions
--------------------------------------------------------------------------------

-- | Parse the text flow

parseAddresses :: BS.ByteString -> BS.ByteString
parseAddresses = addHeader
               . BS.unlines
               . map (reportAddress . validateAddress . parseAddressLine)
               . drop 1
               . BS.lines

-- | add a header line to given ByteString

addHeader :: BS.ByteString -> BS.ByteString
addHeader = BS.append (BS.pack
    "Id\tName\tPartnerFunc\tName 2\tAttn\tPO Box\tBldg\tDock\t\
    \Suite\tUnit\tFloor\tDoors\tStreet\tCity\tCountry\t\
    \Regio/State\tZip\tErrors\tUnparsed\n")

-- | Parse an address line

parseAddressLine :: BS.ByteString -> Address
parseAddressLine s = do
  splitOnTags
  `execState` ([AP I uid, AP PF pf, AP N name]
                   ++ [S L1, AP X add1, S L2, AP X add2, S L3, AP X add3]
                   ++ [S L4] ++ zipCode)
  where
    (_:uid:pf:name:add1:add2:add3:zipC:[]) = BS.split '\t' s
    zipCode = if BS.null zipCT then [] else [AP Z zipCT]
    zipCT = trim zipC

-- | Validate an address line

validateAddress :: Address -> Address
validateAddress = id

-- | Report an address line

reportAddress :: Address -> BS.ByteString
reportAddress addr
  = BS.intercalate (BS.singleton '\t') (map (\f -> f addr)
      [column (isPartType I)
      ,column (isPartType N)
      ,column (isPartType PF)
      ,column (isPartType N2)
      ,column (isPartType AT)
      ,column (isPartType PB)
      ,column (isPartType BLDG)
      ,column (isPartType DOCK)
      ,column (isPartType STE)
      ,column (isPartType UNIT)
      ,column (isPartType FLOOR)
      ,column (isPartType DOORS)
      ,column (isPartType ST)
      ,column (isPartType CI)
      ,column (isPartType CO)
      ,column (isPartType R)
      ,column (isPartType Z)
      ,column (isPartType E)]
      ++ (map value $ filter (isPartType X) addr))

-- | Write column data

column :: (AddressPart -> Bool) -> Address -> BS.ByteString
column test = BS.intercalate (BS.pack " || ") . map value . filter test

-- | Check if given AddressPart is of given type

isPartType :: AddressPartType -> AddressPart -> Bool
isPartType t (AP t' _) = t == t'
isPartType t (AE t')   = t == t'
isPartType t (S t')    = t == t'

-- | Remove whitespace from ends of ByteString

trim :: BS.ByteString -> BS.ByteString
trim = BS.reverse . BS.dropWhile isSpace . BS.reverse . BS.dropWhile isSpace

-- | Split unparsed address parts on keywords (suite, building, doors, etc...)

splitOnTags :: AddressState
splitOnTags = do
  addr <- get
  put $ foldl' f addr tags
  where
    f a t   = concatMap (split t) a
    split (T {partType = p, regex = r}) a@(AP X v)
      | BS.null y    = [a]
      | otherwise = [AP X x, AE p, AP X z]
      where
        (x, y, z) = match r v
    split _ a = [a]

-- | Regex and part type representing tags used for splitting addresses

tags :: [Tag]
tags = [T AT
          (makeRegexOpts compCaseless execBlank
            "\\b(?:ATTN|ATTENTION|C/O)\\b[[:punct:]]?")
          (BS.pack "ATTN: ")
       ,T PB
          (makeRegexOpts compCaseless execBlank
            "\\b(?:(?:P\\.?\\s?O\\.?\\s*)BOX|C\\.P\\.|POSTFACH)\
            \[[:punct:]]?\\s+(?:NO.\\s+)?")
          (BS.pack "PO BOX: ")
       ,T BLDG
          (makeRegexOpts compCaseless execBlank
            "\\b(?:BLDG|BUILDING|HANGAR)[[:punct:]]?")
          (BS.pack "BLDG: ")
       ,T DOCK
          (makeRegexOpts compCaseless execBlank
            "\\bDOCK[[:punct:]]?")
          (BS.pack "DOCK: ")
       ,T STE
          (makeRegexOpts compCaseless execBlank
            "\\b(?:SUITE|STE|APT|ROOM)[[:punct:]]?\
            \\\s+(?:NO[[:punct:]]?)?")
          (BS.pack "STE: ")
       ,T UNIT
          (makeRegexOpts compCaseless execBlank
            "\\bUNIT[[:punct:]]?")
          (BS.pack "UNIT: ")
       ,T FLOOR
          (makeRegexOpts compCaseless execBlank
            "FLOOR")
          (BS.pack "FLOOR: ")
       ,T DOORS
          (makeRegexOpts compCaseless execBlank
            "\\bDOORS?[[:punct:]]?")
          (BS.pack "DOORS: ")]

_______________________________________________
Haskell-Cafe mailing list
Haskell-Cafe@haskell.org
http://www.haskell.org/mailman/listinfo/haskell-cafe

[Haskell-cafe] user error when using Text.Regex.PCRE

Reply via email to