Hello.

The following test code works, but the order in which it steps through the script is wrong.

Currently, as programmed, the code first gathers links (yes, the ones I 
want) from ALL pages returned by each individual start URL, proceeding 
through ALL of start_urls before proceeding through the rest of the script.

This is wrong.
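
As far as I can tell, this happens because BaseSpider schedules every start 
URL up front before any parsing begins. A minimal sketch of that default 
behaviour (paraphrased from memory, so treat it as an approximation rather 
than the exact library source; the class name is just for illustration):

from scrapy.spider import BaseSpider

class DefaultSchedulingSpider(BaseSpider):
    name = "default-sketch"

    # Roughly what BaseSpider does out of the box: every start URL is
    # queued immediately, so the downloader works on all of them
    # concurrently and no single listing finishes before the others begin.
    def start_requests(self):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)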

The code should first grab a listing of links from the first start URL 
(such as *all links contained in* http://localhost/first-page),

then

enumerate through the rest of the script, gathering name, addresslisted, 
link, and website for the links from http://localhost/first-page *only*,

then

repeat this process *for each individual start URL* in a similar fashion.
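
For reference, here is a minimal sketch of the sequencing I am after, 
chaining the start URLs through request meta. The class name and the 
"start_index" meta key are just for illustration, and extract_first() 
assumes the same Scrapy version as the code below:

import urlparse

from scrapy.spider import BaseSpider
from scrapy.http import Request


class SequentialSpider(BaseSpider):
    name = "sequential-sketch"
    start_urls = ["http://localhost/first-page",
                  "http://localhost/second-page",
                  "http://localhost/third-page"]

    def start_requests(self):
        # Schedule only the first start URL; the rest are chained in parse().
        yield Request(self.start_urls[0], meta={"start_index": 0})

    def parse(self, response):
        # ... yield the detail-page Requests for this listing here ...

        # Follow this listing's pagination first.
        next_url = response.css(
            'a.pagination-links_anchor.next::attr(href)').extract_first()
        if next_url:
            yield Request(urlparse.urljoin(response.url, next_url),
                          meta={"start_index": response.meta["start_index"]},
                          callback=self.parse)
        else:
            # Listing exhausted: move on to the next start URL, if any.
            i = response.meta["start_index"] + 1
            if i < len(self.start_urls):
                yield Request(self.start_urls[i],
                              meta={"start_index": i},
                              callback=self.parse)

Note this only serializes the listings themselves; detail pages yielded 
along the way can still download alongside the next listing unless 
concurrency is limited as well (see the settings note after the code).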
--------------------------------------------------------------------------------------------

import urlparse

from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
import w3lib.url

from scrapytest.items import scrapytestItem


class scrapytestSpider(BaseSpider):
    name = "scrapytest"
    download_delay = 2
    # Note: unlike download_delay, these lowercase attributes are not read
    # by Scrapy; set CONCURRENT_REQUESTS and CONCURRENT_REQUESTS_PER_DOMAIN
    # in settings.py instead.
    concurrent_requests = 4
    concurrent_requests_per_domain = 1
    allowed_domains = ["localhost"]
    start_urls = ["http://localhost/first-page";,
    "http://localhost/second-page";,
    "http://localhost/third-page";]

    def parse(self, response):
        selector = Selector(response)
        for title in selector.css("span.companyname-name"):
            page_url = urlparse.urljoin(
                response.url, title.xpath("a/@href").extract()[0])
            self.log("page URL: %s" % page_url)
            yield Request(page_url, callback=self.parse_page)

        # Follow the "next" pagination link, if there is one.
        next_url = response.css(
            'a.pagination-links_anchor.next::attr(href)').extract_first()
        if next_url:
            next_url = urlparse.urljoin(response.url, next_url)
            self.log("next URL: %s" % next_url)
            yield Request(next_url, callback=self.parse)

    def parse_page(self, response):
        selector = Selector(response)
        item = scrapytestItem()
        item["name"] = 
selector.xpath('.//h1[@itemprop="name"]/text()').extract()[0].strip()
        item["addresslisted"] = u"\n".join(
            
selector.xpath('.//address[@itemprop="address"]//text()').extract()).strip()
        item["link"] = response.url
        website = selector.css('div.addresslisted-website a')
        if website:
            website_url = website.xpath('@href').extract()[0]
            item["website"] = w3lib.url.url_query_parameter(website_url, 
"url")
        return item
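
If the detail pages also need to finish before the next listing starts, I 
assume the downloader itself has to be serialized as well; a minimal 
settings.py sketch (assuming a small local test crawl where throughput does 
not matter; this only limits parallelism, the scheduler still decides the 
order of requests):

# settings.py: let requests complete one at a time
CONCURRENT_REQUESTS = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 1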
