getAddresses.py

Scrapes addresses from www.usdirectory.com and stores them in a SQLite database, or writes them to text files for mailing labels, etc

Now, just by typing 'fast food Taco Bell <city> 10 db all' you can find out how many Taco Bells are within 10 miles of you, and store all the addresses in your own address database.

No more convoluted Googling, or hitting the 'Next Page' button, or fumbling with the Yellow Pages...

Note: the db structure is flat on purpose, and the .csv files aren't quote delimited.

Put the program in its own directory. It creates the SQLite database there, and writes files there, too.

Reviews of code, bug reports, criticisms, suggestions for improvement, etc are all welcome.

Enjoy!



========================================================================
#getAddresses.py

import os, sys, requests, time, datetime
from lxml import html
import pyodbc, sqlite3, re


#show values of variables, HTML content, etc
#set it to False for short/concise program output
verbose = False
if verbose == True:
        print "The verbose setting is turned On."
        print ""

        
#check if address is unique
addrCheck = []
def addrUnique(addr):
        if addr not in addrCheck:
                x = True
                addrCheck.append(addr)
        else: x = False 
        return x

        
#validate and parse command line
def showHelp():
        print ""
print " Enter search word(s), city or zip, state, miles to search, txt or csv or db, # addresses to save (no commas)"
        print ""
        print " eg: restaurant Knoxville TN 10 txt 50"
print " search for restaurants within 10 miles of Knoxville TN, and write"
        print "     the first 50 address to a txt file"
        print ""
        print " eg: furniture 30303 GA 20 csv all"
        print "     search for furniture within 20 miles of zip 30303 GA,"
        print "     and write all results to a csv file"
        print ""
        print " eg: boxing gyms Detroit MI 10 db 5"
print " search for boxing gyms within 10 miles of Detroit MI, and store"
        print "     the first 5 results in a database"
        print ""
        print " All entries are case-insensitive (ie TX or tx are acceptable)"
        exit(0)

argCnt = len(sys.argv)
if argCnt < 7: showHelp()
if verbose == True:
        print ""
        print str(argCnt) + " arguments"

keyw = ""                                                     #eg restaurant, 
boxing gym
if argCnt == 7: keyw = sys.argv[1]      #one search word
if argCnt >  7:                                      #multiple search words
        for i in range(1,argCnt-5):
                keyw = keyw + sys.argv[i] + "+"
        keyw = keyw[:-1]                        #drop trailing + sign
cityzip  = sys.argv[argCnt-5]   #eg Atlanta or 30339
state    = sys.argv[argCnt-4]   #eg GA
miles    = sys.argv[argCnt-3]   #eg 5,10,20,30,50 (website allows max 30)
store    = sys.argv[argCnt-2]   #write address to file or database
addrWant = sys.argv[argCnt-1]   #eg save All or number >0

if addrWant.lower() != "all": #how many addresses to save
        if addrWant.isdigit() == False: showHelp()
        if addrWant == "0"            : showHelp()
        addrWant = int(addrWant)
elif addrWant.lower() == "all": addrWant = addrWant.lower()
else: addrWant = int(addrWant)

if store != "csv" and store != "txt" and store != "db": showHelp()


#begin timing the code
startTime = time.clock()        


#website, SQLite db, search string, current date/time for use with db
datasrc = "www.usdirectory.com"
dbName  = "addresses.sqlite"
search = keyw + " " + str(cityzip) + " " + state + " " + str(miles) + " " + str(addrWant)
loaddt = datetime.datetime.now()


#write addresses to file
#each time the same search is done, the file is deleted and recreated
if store == "csv" or store == "txt":
        #csv will write in .csv format - header and 1 line per address
        #txt will write out 3 lines per address, then blank before next address
        webfile  = "usdirectory.com_"+keyw+"_"+cityzip+"_"+state+"."+store
        f = open(webfile,"w")
        if store == "csv": f.write("Name,Address,CityStateZip\n")
        f.close


#store addresses in database    
cSQL = ""
if store == "db":     
        #creates a SQLite database that Access 2003 can't read
        #conn = sqlite3.connect(dbName)

        #also creates a SQLite database that Access 2003 can't read
        conn = pyodbc.connect('Driver={SQLite3 ODBC Driver};Database=' + dbName)
        db   = conn.cursor()

        cSQL =  "CREATE TABLE If Not Exists ADDRESSES "
cSQL += "(datasrc, search, category, name, street, city, state, zip, loaddt, "
        cSQL += "PRIMARY KEY (datasrc, search, name, street));"
        db.execute(cSQL)
        
        # cSQL =  "CREATE TABLE If Not Exists CATEGORIES "
        # cSQL += "(catID INTEGER PRIMARY KEY, catDesc);"
        # db.execute(cSQL)
# db.execute("CREATE UNIQUE INDEX If Not Exists UIDX_CATDESC ON CATEGORIES (catDesc);")
        
        conn.commit()
        if verbose == True:
                print("connected to database: " + dbName)
                print cSQL
                print("created table: addresses")
                print("")
        
        
if verbose == True:
        print "Search summary"
        print "------------------------------"
        print "Keywords: " + keyw
        print "City/Zip: " + cityzip
        print "State   : " + state
        print "Radius  : " + str(miles)    + " miles"
        print "Save    : " + str(addrWant) + " addresses to " + store
        print "------------------------------"
        print ""
        
#build url
wBase    = "http://www.usdirectory.com";
wForm    = "/ypr.aspx?fromform=qsearch"
wKeyw   = "&qhqn=" + keyw
wCityZip = "&qc="   + cityzip
wState   = "&qs="   + state
wDist    = "&rg="   + str(miles)
wSort    = "&sb=a2z"  #sort alpha
wPage    = "&ap="   #used with the results page number
webpage = wBase + wForm + wKeyw + wCityZip + wState + wDist
if verbose == True:
        print "Search url: \n" + webpage
        print ""


#delete previous results of identical search
if store == "db":
        cSQL = "DELETE FROM addresses "
        cSQL += "WHERE datasrc = '" + datasrc + "' "
        cSQL += "AND   search  = '" + search + "';"
        if verbose == True: print cSQL
        db.execute(cSQL)
        conn.commit()

#query web server, save results
print "searching..."
i = 0
dupes = 0
addrReturned = 0
addrSaved = 0
while 1:
        wPageNbr = wPage + str(i+1)
webpage = wBase + wForm + wKeyw + wCityZip + wState + wDist + wSort + wPageNbr
        page = requests.get(webpage)
        tree = html.fromstring(page.content)
        
        #no matches
        matches = tree.xpath('//strong/text()')         
        if i == 0 and "No results were found" in str(matches):
                print "No results found for that search"
                exit(0)
                os.remove(webfile)
                
        #parse number of addresses returned
        #some searches return 2 items: ['Found N results', 'junk']
#some searches return 3 items: ['Filter this search','Found N results','junk']
        if i == 0:
                match  = tree.xpath('//div[@class="header_text"]/text()')
                if len(match) > 2: match.pop(0) #remove first element if 2
                match = [int(s) for s in match[0].split() if s.isdigit()]
                addrFound = match[0]
                if addrWant != "all": addrWant = min(addrWant,addrFound)
print str(addrFound) + " matches found (" + str(addrWant) + " will be saved)"
                print ""
        
        
        #split names, addresses into lists
        nms   = tree.xpath('//span[@class="header_text3"]/text()')
        if len(nms) == 0: break
        addr   = tree.xpath('//span[@class="text3"]/text()')
        addr   = [t.replace("\r\n", "")  for t in addr]
        addr   = filter(None, (t.strip() for t in addr))
        street = [s.split(',')[0] for s in addr]
        city   = [c.split(',')[1].strip() for c in addr]
        state  = [s[-8:][:2] for s in addr]
        zip    = [z[-5:] for z in addr]

        #get usdirectory.com categories
        category = tree.xpath('//a/text()')
        category = [c.strip() for c in category]
        category = filter(None, category)
        pattern  = re.compile(r"^[A-Z\s&,-]+$")
        category = [x for x in category if pattern.match(x)]
        
        #screen feedback
        print "retrieving page " + str(i+1)   + ": " + str(len(nms)) + " 
addresses"
        if verbose == True:
                print ""
                print "Names: \n" + str(nms)
                print ""
                print "Addresses: \n" + str(addr)
                print ""
                print "Categories: \n" + str(category)
print "----------------------------------------------------------------------------------------"
                print ""
                
        #data integrity check - make sure all lists have same # of items
lenData = [len(category),len(nms),len(addr),len(street),len(city),len(state),len(zip)]
        if len(set(lenData)) != 1:
print "Data parsing issue. One or more lists has an incorrect number of items. Program will exit."
                exit(0)
        if verbose == True:
if len(set(lenData)) == 1: print "Verified: each list has " + str(len(nms)) + " items in it."
        
        
        #write addresses to file
        if store == "txt" or store == "csv":
                addrList = []
                for j in range(len(nms)):
                        if addrUnique(nms[j]+' '+street[j]) == True:
if store == "txt": addrList.append(nms[j]+'\n'+street[j]+'\n'+city[j]+', '+state[j]+' '+zip[j]+'\n\n') if store == "csv": addrList.append(nms[j]+',' +street[j]+',' +city[j]+' ' +state[j]+' '+zip[j]+'\n')
                                addrSaved += 1
                        else:
                                dupes += 1
                                print " * duplicate address found: " + nms[j] + ", 
" + street[j]
                        addrReturned += 1                       
                        if addrWant != "all":
                                if addrSaved >= addrWant: break
                        
                f = open(webfile,"a")
                for address in addrList: f.write(address)
                f.close

                
        #write addresses to database
        if store == "db":
                for j in range(len(nms)):
                        dupeRow = False
cSQL = "INSERT INTO ADDRESSES VALUES (?,?,?,?,?,?,?,?,?)" #(datasrc,search,category,name,street,city,state,zip,loaddt) " Vals = datasrc,search,category[j],nms[j],street[j],city[j],state[j],zip[j],str(loaddt)
                        if verbose == True: print cSQL + ',' + str(Vals)
                        try: db.execute(cSQL, Vals)
                        except (pyodbc.Error) as programError:
                                if str(programError).find("UNIQUE constraint 
failed") > 0:
                                        dupeRow = True
                                        dupes +=1
                                        print " * duplicate address found: " + nms[j] + 
", " + street[j]
                                        pass
                        addrReturned += 1               
                        if dupeRow == False:
                                addrSaved += 1
                        if addrWant != "all":
                                if addrSaved >= addrWant: break
                conn.commit()
                
                
        if addrSaved >= addrFound or addrSaved >= addrWant: break
        i += 1
        time.sleep(2)

        
#finish
if (store == "csv" or store == "txt"):
print "\nFinished\nWrote " + str(addrSaved) + " addresses to file " + webfile

elif store == "db":
        db.close()
        conn.close()
print "\nFinished\nStored " + str(addrSaved) + " addresses in database: " + dbName
        
if dupes > 0: print "(" + str(dupes) + " duplicate addresses ignored)"
        
#timer
endTime = time.clock()
print "processing time: %.2g seconds" %(endTime-startTime)

#bug in www.usdirectory.com code: usually overreports matches by 1
if (addrWant == "all") and (addrReturned != addrFound):
print "Note: " + datasrc + " reported " + str(addrFound) + " matches, but returned " + str(addrReturned)
========================================================================        


--
https://mail.python.org/mailman/listinfo/python-list

Reply via email to