Hello,

I'm no lxml expert, so it could be a newbie error… but the following web 
crawler script sometimes breaks (see "BUG") while trying to find the number of 
provinces/properties, even after two one-second sleeps:

==========
import requests
from lxml import html
import re
import math
import time

def grab_properties():
        """Print each property URL on the current page and its coordinates.

        Reads the module-level ``soup`` (the parsed page currently being
        processed) and ``pattern_coords``. Every link found in a "gallery"
        div is fetched, and "lat<TAB>lon" is printed when the coordinates
        pattern is found in the response body.
        """
        properties = soup.xpath("//div[contains(@class,'gallery')]/a/@href")
        for property_url in properties:
                print(property_url)
                response = requests.get(property_url)
                #search the raw HTML since the data is in JSON
                coords = pattern_coords.search(response.text)
                if coords:
                        lat,lon=coords.group(1),coords.group(2)
                        print(f"{lat}\t{lon}")

#ignore trailing s for singular/plural; raw string so \d is a regex escape
pattern_count = re.compile(r"(\d+) Propertie")
pattern_coords = re.compile("latitude:(.+?),longitude:([^}]+)") #JSON


def find_count(tree):
        """Return the regex match holding the property count in *tree*, or None.

        xpath() returns a list of text nodes. The list is empty when the
        'properties-count' div is absent from the page, and the first node may
        be whitespace only, so every node is tried instead of indexing [0]
        (which raised IndexError on an empty result).
        """
        texts = tree.xpath("//div[contains(@class,'properties-count')]/text()")
        print(texts)
        for text in texts:
                match = pattern_count.search(text)
                if match:
                        return match
        return None


provinces = ["a", "b", "c"]
for province in provinces:
        time.sleep(1) #pause between requests

        url = f"https://www.acme.com/{province}/"
        print("======== ",url)
        response = requests.get(url)
        soup = html.fromstring(response.text)

        count = find_count(soup)
        if count:
                #print the captured number, not the Match object
                print("Number of locations:",count.group(1))

        locations = soup.xpath("//div[contains(@class,'other-location-box')]/a/@href")
        for location in locations:
                time.sleep(1) #pause between requests
                print(location)
                response = requests.get(location)
                #grab_properties() reads the module-level soup, so rebind it here
                soup = html.fromstring(response.text)

                count = find_count(soup)
                if not count:
                        print("Number of properties not found")
                        continue #next location (break would abandon the remaining ones)

                print("Number of properties found",count.group(1))

                #grab what's in current, first page
                grab_properties()

                #If > 30, must update URL and loop through pages by groups of 30
                count = int(count.group(1))
                for index in range(2,math.ceil(count/30)+1):
                        time.sleep(1) #pause between requests
                        url = f"{location}p/{index}/" #new URL
                        response = requests.get(url)
                        soup = html.fromstring(response.text)
                        grab_properties()
==========

Am I using the wrong syntax to grab the numbers?

Thank you.
_______________________________________________
lxml - The Python XML Toolkit mailing list -- lxml@python.org
To unsubscribe send an email to lxml-le...@python.org
https://mail.python.org/mailman3/lists/lxml.python.org/
Member address: arch...@mail-archive.com

Reply via email to