I have a large regular expression (223613 chars) that works fine in
CPython 2.6, but seems to loop endlessly in IronPython (see below).  I'm
using Mono 2.10 (.NET 4.0.x) on Ubuntu, with IronPython 2.7.  Does anyone
have pointers to the differences between the two regex implementations?
Is System.Text.RegularExpressions in .NET configurable in some fashion
that might help?

I'm a .NET newbie.

TIA,

Bill

--------------------------------------------------
import sys, os, re

try:
    # We use the name lists in nltk to create person-name matching patterns.
    import nltk.data
except ImportError:
    # Without nltk there is nothing to build the name lists from, so
    # report the problem on stderr and exit with a failure status.
    sys.stderr.write("Can't import nltk; can't do name lists.\n"
                     "See http://www.nltk.org/.\n")
    sys.exit(1)
else:
    # Corpus entries that must not be treated as given names.
    __MALE_NAME_EXCLUDES = ("Hill",
                            "Ave",
                            )
    __FEMALE_NAME_EXCLUDES = ()

    # Each corpus file is loaded as raw text and split on newlines; empty
    # strings (e.g. from a trailing newline) and excluded entries are dropped.
    __FEMALE_NAMES = [x for x in
                      nltk.data.load("corpora/names/female.txt",
                                     format="raw").split("\n")
                      if (x and (x not in __FEMALE_NAME_EXCLUDES))]
    # Also accept every name written entirely in capitals.
    __FEMALE_NAMES += [x.upper() for x in __FEMALE_NAMES]

    __MALE_NAMES = [x for x in
                    nltk.data.load("corpora/names/male.txt",
                                   format="raw").split("\n")
                    if (x and (x not in __MALE_NAME_EXCLUDES))]
    __MALE_NAMES += [x.upper() for x in __MALE_NAMES]

    # Single capital letters 'A'..'Z', used as first-name initials.
    # range() excludes its stop value, hence ord('Z') + 1 so that 'Z'
    # itself is included.
    __INITS = [chr(x) for x in range(ord('A'), ord('Z') + 1)]

def _name_alternation(names):
    """Return a regex alternation ("a|b|c") built from *names*.

    Every name is passed through re.escape() so that it is matched
    literally.  Duplicates are dropped, keeping the first occurrence of
    each name in its original position: the regex engine therefore tries
    the alternatives in the same order as before, but the pattern no
    longer repeats names that occur in more than one input list.
    """
    seen = set()
    alternatives = []
    for name in names:
        if name not in seen:
            seen.add(name)
            alternatives.append(re.escape(name))
    return "|".join(alternatives)


# Matches an optional honorific, a first name (or bare initial), an optional
# middle initial or middle name, and a capitalized last name.  Named groups:
# honorific, firstname, middlename, lastname.  The unnamed capturing groups
# are kept as they were so that numeric group indices stay the same.
PERSON_PATTERN = re.compile(
    r"^((?P<honorific>Mr|Ms|Mrs|Dr|MR|MS|MRS|DR)\.? )?"       # honorific
    r"(?P<firstname>" +
    _name_alternation(__FEMALE_NAMES + __MALE_NAMES + __INITS) +  # first name
    r")"
    r"( (?P<middlename>([A-Z]\.)|(" +
    _name_alternation(__FEMALE_NAMES + __MALE_NAMES) +  # middle initial or name
    r")))?"
    r" +(?P<lastname>[A-Z][A-Za-z]+)",              # space then last name
    re.MULTILINE)

# Parenthesized single-argument form prints the same thing under Python 2
# and is also valid Python 3 syntax.
print(PERSON_PATTERN.match("Mr. John Smith"))
_______________________________________________
Users mailing list
Users@lists.ironpython.com
http://lists.ironpython.com/listinfo.cgi/users-ironpython.com

Reply via email to