Hello. The following test code works, but the order in which it steps through the start URLs is wrong.
Currently, as programmed, the code first gathers links (yes, the ones i want) from ALL pages returned by each individual start_urls, proceeding through ALL start_urls without proceeding through through rest of script. This is wrong. The code should first grab a listing of links from the first start_urls (such as *all links contained in* "http://localhost/first-page") then enumerate through the rest of the script gathering name, addresslisted, link, website for links in http://localhost/first-page *only*, then repeat this process *for each individual start_urls* in a similar fashion. -------------------------------------------------------------------------------------------- import urlparse from scrapy.spider import BaseSpider from scrapy.selector import Selector from scrapy.http import Request import w3lib.url from scrapytest.items import scrapytestItem class scrapytestSpider(BaseSpider): name = "scrapytest" download_delay = 2 concurrent_requests = 4 concurrent_requests_per_domain = 1 allowed_domains = ["localhost"] start_urls = ["http://localhost/first-page", "http://localhost/second-page", "http://localhost/third-page"] def parse(self, response): selector = Selector(response) for title in selector.css("span.companyname-name"): page_url = urlparse.urljoin(response.url, title.xpath("a/@href").extract()[0]) self.log("page URL: %s" % page_url) yield Request(page_url, callback=self.parse_page) for next_page in selector.css(u'ul > li > a.prev-next:contains(\u3421)'): next_url = response.css('a.pagination-links_anchor.next::attr(href)').extract_first() if next_url: next_url = urlparse.urljoin(response.url, next_url) self.log("next URL: %s" % next_url) yield Request(next_url, callback=self.parse) def parse_page(self, response): selector = Selector(response) item = scrapytestItem() item["name"] = selector.xpath('.//h1[@itemprop="name"]/text()').extract()[0].strip() item["addresslisted"] = u"\n".join( 
selector.xpath('.//address[@itemprop="address"]//text()').extract()).strip() item["link"] = response.url website = selector.css('div.addresslisted-website a') if website: website_url = website.xpath('@href').extract()[0] item["website"] = w3lib.url.url_query_parameter(website_url, "url") return item -- You received this message because you are subscribed to the Google Groups "scrapy-users" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. To post to this group, send email to [email protected]. Visit this group at https://groups.google.com/group/scrapy-users. For more options, visit https://groups.google.com/d/optout.
