Re: Advice for a python newbie on parsing whois records?

Phillip B Oldham Wed, 11 Jun 2008 09:31:39 -0700

On Jun 10, 8:21 pm, Miki <[EMAIL PROTECTED]> wrote:
> Hello,
>
> > Hi. I'm stretching my boundaries in programming with a little python
> > shell-script which is going to loop through a list of domain names,
> > grab the whois record, parse it, and put the results into a csv.
>
> > I've got the results coming back fine, but since I have *no*
> > experience with python I'm wondering what would be the preferred
> > "pythonic" way of parsing the whois string into a csv record.
>
> > Tips/thoughts/examples more than welcome!
>
> from os import popen
> import re
>
> find_record = re.compile("\s+([^:]+): (.*)\s*").match
> for line in popen("whois google.com"):
>     match = find_record(line)
>     if not match:
>         continue
>     print "%s --> %s" % (match.groups()[0], match.groups()[1])
>
> HTH,
> --
> Miki <[EMAIL PROTECTED]>http://pythonwise.blogspot.com


OK, here's what I've got so far. I'm treating this as a learning
exercise, so the resulting file isn't as important as understanding
and thinking in Python (although I believe the results are adequate
for my needs). I'd appreciate the community's comments, as this is my
*first* attempt at Python and it has taken me a couple of hours
(including googling).

#!/usr/bin/env python
import subprocess
import re

# Input list, iterated line by line further down (one domain per line).
src = open('./domains.txt', 'r')

# Output file; record() appends one '|'-joined row to it per call.
dest = open('./whois.csv', 'w')

def trim( txt ):
        """Strip boilerplate lines from a raw whois reply.

        Lines are dropped when, ignoring surrounding whitespace, they are
        empty or begin with ``WHOIS``, ``>>>`` or ``%``.  Every surviving line
        is kept with one extra leading space.

        A line that begins with ``--`` (no leading whitespace) ends the scan
        early; in that case the lines kept so far are concatenated with no
        separator.  Otherwise the kept lines are joined with newlines.
        """
        noise_prefixes = ('WHOIS', '>>>', '%')
        kept = []
        for raw_line in txt.split("\n"):
                bare = raw_line.strip()
                if bare == "" or bare.startswith(noise_prefixes):
                        continue
                # The terminator test is on the unstripped line, so an indented
                # "--" is treated as ordinary content.
                if raw_line.startswith("--"):
                        return ''.join(kept)
                kept.append(" " + raw_line)
        return "\n".join(kept)

def clean( txt ):
        """Return only the lines of *txt* that look like ``label: value``.

        A line is kept when, after at most one leading whitespace character,
        it has at least one non-colon character followed by a colon and a
        space.  Kept lines are returned unchanged, joined with newlines; if
        nothing matches the result is the empty string.
        """
        # Raw string: "\s" inside a plain literal is an invalid escape
        # sequence (a SyntaxWarning on recent Python versions).
        looks_like_field = re.compile(r"^\s?([^:]+): ").match
        return "\n".join(
                line for line in txt.split("\n") if looks_like_field(line)
        )

def clean_co_uk( rec ):
        """Reshape a trimmed .co.uk whois record into ``label: value`` lines.

        The record is flattened onto one line and a newline is then inserted
        in front of everything that looks like a label, so that a label and
        the value that followed it on later lines end up on the same line.
        A leading run of spaces followed by a newline is removed from the
        very start of the result.

        NOTE(review): the label pattern accepts any one or two words followed
        by ": ", preceded by any character other than "(", so the tail of a
        value directly before a label can be pulled onto the label's line.
        """
        # After this the text "Company number: " no longer exists, so the
        # steps below cannot split on it (presumably it is part of a value,
        # not a field of its own -- confirm against real records).
        rec = rec.replace('Company number:', 'Company number -')
        # Flatten.  (Removing every newline makes a prior "\n\n" -> "\n"
        # collapse unnecessary.)
        rec = rec.replace("\n", "")
        # Temporarily mark every "label: " boundary with ":\n" ...
        rec = rec.replace(": ", ":\n")
        # ... start a new line before each marked label ...
        rec = re.sub(r"([^(][a-zA-Z']+\s?[a-zA-Z]*:\n)", r"\n\g<0>", rec)
        # ... and turn the markers back into ": ".
        rec = rec.replace(":\n", ": ")
        return re.sub(r"^[ ]+\n", "", rec)

def clean_net( rec ):
        """Reshape a trimmed whois record into ``label: value`` lines.

        The record is flattened onto one line and a newline is then inserted
        in front of everything that looks like a label (one or two words made
        of letters/apostrophes, followed by ": "), so that a label and the
        value that followed it on later lines end up on the same line.

        NOTE(review): because a label may be two words, the last word of a
        value that sits directly before a one-word label is matched as part
        of that label and moves onto the label's line.
        """
        # Flatten.  (Removing every newline makes a prior "\n\n" -> "\n"
        # collapse unnecessary.)
        rec = rec.replace("\n", "")
        # Temporarily mark every "label: " boundary with ":\n" ...
        rec = rec.replace(": ", ":\n")
        # ... start a new line before each marked label ...
        rec = re.sub(r"([a-zA-Z']+\s?[a-zA-Z]*:\n)", r"\n\g<0>", rec)
        # ... and turn the markers back into ": ".
        return rec.replace(":\n", ": ")

def clean_info( rec ):
        """Insert a space after the first colon of every line in *rec*.

        Turns ``label:value`` lines into ``label: value``.  A line whose
        first character is a colon, or that has no colon, is left untouched.
        A line that already has a space after the colon gains a second one.
        """
        # Compiled once instead of per line; raw strings avoid the invalid
        # "\g" escape in a plain literal.
        first_colon = re.compile(r"^([^:]+):")
        return "\n".join(
                first_colon.sub(r"\g<0> ", line) for line in rec.split("\n")
        )

# Lower-cased whois label -> slot in the output row.  Slot 0 holds the
# domain; the remaining slots are:
#   1 registrant, 2 registrant's address, 3 registrar, 4 type,
#   5 registered, 6 renewal, 7 updated, 8 name servers
_RECORD_SLOTS = {
        "registrant": 1,
        "registrant name": 1,
        "registrant type": 4,
        "registrant's address": 2,
        "registrant address1": 2,
        "registrar": 3,
        "sponsoring registrar": 3,
        "registered on": 5,
        "registered": 5,
        "domain registeration date": 5,
        "renewal date": 6,
        "last updated": 7,
        "domain last updated date": 7,
        "name servers": 8,
        "name server": 8,
        "nameservers": 8,
        "updated date": 7,
        "creation date": 5,
        "expiration date": 6,
        "domain expiration date": 6,
        "administrative contact": 2,
}

def record(domain, record):
        """Write one '|'-separated row for *domain* to the global ``dest``.

        domain -- domain name; written lower-cased as the first column.
        record -- mapping of whois label to value.  Labels are matched
                  case-insensitively against the known labels; unknown
                  labels and empty values are ignored.  When several labels
                  map to the same column, the one iterated last wins.

        The row always has nine columns and ends with a newline.  (Inside
        this function the name ``record`` refers to the mapping argument,
        not to the function.)
        """
        details = [''] * 9
        # Set up front so the domain is present even for an empty mapping.
        details[0] = domain.lower()
        for k, v in record.items():
                try:
                        slot = _RECORD_SLOTS[k.lower()]
                except (KeyError, AttributeError):
                        # Unknown label (or a key that is not a string): skip it.
                        continue
                if v != '':
                        details[slot] = v

        dest.write('|'.join(details)+"\n")

## Loop through domains
#
# For every non-blank line of the input file: run `whois`, strip the
# boilerplate, apply the TLD-specific reshaping, keep the "label: value"
# lines, and hand the resulting mapping to record().

# Replies starting with one of these carry no record to parse.
_NO_SERVER_PREFIXES = ("No whois server", "This TLD has no whois server")

# (suffix or tuple of suffixes, reshaping function).  The suffixes are
# mutually exclusive, so at most one entry applies to a given domain.
_TLD_CLEANERS = (
        ((".net", ".com", ".tv"), clean_net),
        (".co.uk", clean_co_uk),
        (".info", clean_info),
)

try:
        for domain in src:

                domain = domain.strip()

                if domain == '':
                        continue

                rec = subprocess.Popen(["whois", domain],
                                stdout=subprocess.PIPE).communicate()[0]

                # On Python 3 the pipe yields bytes; the string handling below
                # needs text.  On Python 2 the output is already a str.
                if not isinstance(rec, str):
                        rec = rec.decode("utf-8", "replace")

                if rec.startswith(_NO_SERVER_PREFIXES):
                        continue

                rec = trim(rec)

                # Compare case-insensitively: record() lower-cases the domain
                # for output, so "EXAMPLE.COM" should be cleaned like
                # "example.com".
                lowered = domain.lower()
                for suffixes, cleaner in _TLD_CLEANERS:
                        if lowered.endswith(suffixes):
                                rec = cleaner(rec)

                rec = clean(rec)

                details = {}
                for line in rec.split("\n"):
                        # Split on the first ": " only, so a value that itself
                        # contains ": " is kept whole.
                        label, sep, value = line.partition(': ')
                        if not sep:
                                continue
                        details[label.strip()] = value.strip().replace("\t", ", ")

                # Nothing parseable came back for this domain: write no row.
                if not details:
                        continue

                record(domain, details)
finally:
        # Close both files even if a lookup or parse step raised.
        src.close()
        dest.close()
--
http://mail.python.org/mailman/listinfo/python-list

Re: Advice for a python newbie on parsing whois records?

Reply via email to