Hello, I'm no lxml expert, so it could be a newbie error… but the following web scraper script sometimes breaks (see "BUG") while trying to find the number of provinces/properties, even after two one-second sleeps:
========== import requests from lxml import html import re import math import time def grab_properties(): properties = soup.xpath("//div[contains(@class,'gallery')]/a/@href") for property in properties: print(property) response = requests.get(property) coords = pattern_coords.search(response.text) #raw HTML since data in JSON if coords: lat,lon=coords.group(1),coords.group(2) print(f"{lat}\t{lon}") pattern_count = re.compile("(\d+) Propertie") #ignore trailing s for singular/plural pattern_coords = re.compile("latitude:(.+?),longitude:([^}]+)") #JSON provinces = ["a", "b", "c"] for province in provinces: time.sleep(1) #added but still no cigar url = f"https://www.acme.com/{province}/" print("======== ",url) response = requests.get(url) soup = html.fromstring(response.text) #BUG time-out? count = soup.xpath("//div[contains(@class,'properties-count')]/text()") print(count) count = pattern_count.search(count[0]) if count: print("Number of locations:",count) locations = soup.xpath("//div[contains(@class,'other-location-box')]/a/@href") for location in locations: time.sleep(1) #added but still no cigar print(location) response = requests.get(location) soup = html.fromstring(response.text) #BUG time-out? count = soup.xpath("//div[contains(@class,'properties-count')]/text()") print(count) count = pattern_count.search(count[0]) if not count: print("Number of properties not found") break #next location else: print("Number of properties found",count.group(1)) #grab what's in current, first page grab_properties() #If > 30, must update URL and loop through pages by groups of 30 count = int(count.group(1)) for index in range (2,math.ceil(count/30)+1): time.sleep(1) #added but still no cigar url = f"{location}p/{index}/" #new URL response = requests.get(url) soup = html.fromstring(response.text) grab_properties() ========== Am I using the wrong syntax to grab the numbers? Thank you. 
_______________________________________________ lxml - The Python XML Toolkit mailing list -- lxml@python.org To unsubscribe send an email to lxml-le...@python.org https://mail.python.org/mailman3/lists/lxml.python.org/ Member address: arch...@mail-archive.com