On Jun 10, 8:21 pm, Miki <[EMAIL PROTECTED]> wrote:
> Hello,
>
> > Hi. I'm stretching my boundaries in programming with a little python
> > shell-script which is going to loop through a list of domain names,
> > grab the whois record, parse it, and put the results into a csv.
>
> > I've got the results coming back fine, but since I have *no*
> > experience with python I'm wondering what would be the preferred
> > "pythonic" way of parsing the whois string into a csv record.
>
> > Tips/thoughts/examples more than welcome!
>
> from os import popen
> import re
>
> find_record = re.compile("\s+([^:]+): (.*)\s*").match
> for line in popen("whois google.com"):
>     match = find_record(line)
>     if not match:
>         continue
>     print "%s --> %s" % (match.groups()[0], match.groups()[1])
>
> HTH,
> --
> Miki <[EMAIL PROTECTED]>
> http://pythonwise.blogspot.com
# OK, here's what I've got so far. I'm treating this as a learning
# exercise, so the resulting file isn't so important as understanding and
# thinking in python (although I believe the results are adequate for my
# needs). I'd appreciate the community's comments as this is my *first*
# attempt at python and has taken me a couple of hours (including
# googling).

#!/usr/bin/env python
"""Look up whois records for a list of domains and write a summary file.

Reads one domain per line from ./domains.txt, runs the system ``whois``
command for each, normalises the output into ``key: value`` lines and
appends one pipe-delimited row per domain to ./whois.csv.
"""

import subprocess
import re

# Lines starting with any of these (after stripping) are banner/notice
# noise rather than record data.
_NOISE_PREFIXES = ('WHOIS', '>>>', '%')

# Responses starting with any of these carry no record at all.
_NO_SERVER_PREFIXES = ("No whois server", "This TLD has no whois server")

# A usable record line: optional single leading whitespace, a key, ": ".
_FIELD_LINE = re.compile(r"^\s?([^:]+): ")

# Key detectors used when re-splitting a record that was collapsed onto a
# single line.  The .co.uk variant additionally consumes one preceding
# character that must not be "(".
_NET_KEY = re.compile(r"([a-zA-Z']+\s?[a-zA-Z]*:\n)")
_CO_UK_KEY = re.compile(r"([^(][a-zA-Z']+\s?[a-zA-Z]*:\n)")

# Adds the missing space after the first "key:" of a line.
_INFO_KEY = re.compile(r"^([^:]+):")

# Output columns are:
#   0 domain, 1 registrant, 2 registrant's address, 3 registrar, 4 type,
#   5 registered, 6 renewal, 7 updated, 8 name servers
# This maps a lower-cased whois field name to the column it fills.
_FIELD_COLUMNS = {
    "registrant": 1,
    "registrant name": 1,
    "registrant type": 4,
    "registrant's address": 2,
    "registrant address1": 2,
    "registrar": 3,
    "sponsoring registrar": 3,
    "registered on": 5,
    "registered": 5,
    # Misspelt key kept from the original table in case a server emits it;
    # the correctly spelt variant is accepted as well.
    "domain registeration date": 5,
    "domain registration date": 5,
    "renewal date": 6,
    "last updated": 7,
    "domain last updated date": 7,
    "name servers": 8,
    "name server": 8,
    "nameservers": 8,
    "updated date": 7,
    "creation date": 5,
    "expiration date": 6,
    "domain expiration date": 6,
    "administrative contact": 2,
}


def trim(txt):
    """Drop blank/banner lines and everything from the "--" separator on.

    Every kept line is prefixed with one space.  The kept lines are always
    joined with newlines (previously the separator path joined them with
    nothing, which merged all fields into a single line).
    """
    kept = []
    for line in txt.split("\n"):
        stripped = line.strip()
        if not stripped or stripped.startswith(_NOISE_PREFIXES):
            continue
        if line.startswith("--"):
            # Everything after the separator is legal boilerplate.
            break
        kept.append(" " + line)
    return "\n".join(kept)


def clean(txt):
    """Keep only the lines that look like ``key: value``."""
    return "\n".join(
        line for line in txt.split("\n") if _FIELD_LINE.match(line)
    )


def _unwrap_fields(rec, key_pattern):
    """Collapse *rec* to one line, then start a new line before each key.

    A key is whatever *key_pattern* matches immediately before ": ".  The
    patterns allow a single whitespace inside a key, so a value that is
    separated from the next key by exactly one space may donate its last
    word to that key; indented whois output does not trigger this.
    """
    rec = rec.replace("\n", "")
    # Temporarily mark "key: " boundaries with a newline so the key
    # pattern can anchor on ":\n".
    rec = rec.replace(": ", ":\n")
    rec = key_pattern.sub(r"\n\g<0>", rec)
    return rec.replace(":\n", ": ")


def clean_co_uk(rec):
    """Normalise a .co.uk record into one ``key: value`` pair per line."""
    # "Company number:" appears inside a value; neutralise its colon so it
    # is not mistaken for a field key.
    rec = rec.replace('Company number:', 'Company number -')
    rec = _unwrap_fields(rec, _CO_UK_KEY)
    # Remove a leading line consisting only of spaces.
    return re.sub(r"^[ ]+\n", "", rec)


def clean_net(rec):
    """Normalise a .net/.com/.tv record into one pair per line."""
    return _unwrap_fields(rec, _NET_KEY)


def clean_info(rec):
    """Insert a space after the first ``key:`` on every line."""
    return "\n".join(
        _INFO_KEY.sub(r"\g<0> ", line) for line in rec.split("\n")
    )


def record(domain, record, out=None):
    """Write one pipe-delimited row for *domain*.

    domain -- the domain name; written lower-cased in column 0.
    record -- dict of whois field name -> value.  (The parameter name
              shadows this function inside its body; it is kept for
              compatibility with existing callers.)
    out    -- writable file-like object.  When omitted, the module-level
              ``dest`` bound by the script entry point is used.

    Field names are matched case-insensitively; unknown names and empty
    values are ignored.
    """
    details = [''] * 9
    # Set before the loop so the row is labelled even when no field maps.
    details[0] = domain.lower()
    for key, value in record.items():
        column = _FIELD_COLUMNS.get(key.lower())
        if column is not None and value != '':
            details[column] = value
    if out is None:
        out = dest
    out.write('|'.join(details) + "\n")


def parse_fields(rec):
    """Turn ``key: value`` lines into a dict.

    Only the first ": " splits a line, so values that contain ": "
    themselves are kept whole.  Lines without ": " are skipped.  Tabs
    inside a value become ", ".
    """
    fields = {}
    for line in rec.split("\n"):
        key, sep, value = line.partition(': ')
        if not sep:
            continue
        fields[key.strip()] = value.strip().replace("\t", ", ")
    return fields


# Which cleaner applies to which domain suffixes.
_TLD_CLEANERS = (
    ((".net", ".com", ".tv"), clean_net),
    ((".co.uk",), clean_co_uk),
    ((".info",), clean_info),
)


def normalise(domain, raw):
    """Reduce raw whois output for *domain* to clean ``key: value`` lines."""
    rec = trim(raw)
    suffix_source = domain.lower()
    for suffixes, cleaner in _TLD_CLEANERS:
        if suffix_source.endswith(suffixes):
            rec = cleaner(rec)
    return clean(rec)


def fetch_whois(domain):
    """Run ``whois`` for *domain* and return its standard output as text."""
    proc = subprocess.Popen(["whois", domain], stdout=subprocess.PIPE)
    raw = proc.communicate()[0]
    if not isinstance(raw, str):
        # Python 3 hands back bytes; whois servers are not consistent
        # about encoding, so never fail on an undecodable byte.
        raw = raw.decode("utf-8", "replace")
    return raw


def lookup(domain):
    """Return the parsed whois fields for *domain* ({} when unavailable)."""
    raw = fetch_whois(domain)
    if raw.startswith(_NO_SERVER_PREFIXES):
        return {}
    return parse_fields(normalise(domain, raw))


if __name__ == "__main__":
    with open('./domains.txt') as src:
        with open('./whois.csv', 'w') as dest:
            ## Loop through domains
            for domain in src:
                domain = domain.strip()
                if not domain:
                    continue
                details = lookup(domain)
                if not details:
                    # Nothing usable came back; do not emit an empty row.
                    continue
                record(domain, details, dest)

# --
# http://mail.python.org/mailman/listinfo/python-list