restrict_xpaths should select the area of the page from which you want links
to be extracted.
Try using

rules = (
    Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]',)),
         callback='parsePage', follow=True),
)

instead — that is, without the "/@href" attribute step in the XPath.
restrict_xpaths must point at elements (or a region containing links), not at
attributes; the link extractor pulls the href itself.
On Monday, January 13, 2014 6:12:50 PM UTC+1, ajrpc wrote:
>
> Thank you very much Paul.
>
> I've changed the Rule callback to 'parsePage' and renamed def parse to def
> parsePage, and now it doesn't enter parsePage() — it does nothing. Now
> the code looks like:
>
> from scrapy.contrib.spiders import CrawlSpider, Rule
> from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
> from scrapy.selector import Selector
> from scrapy import log
> from urlparse import urlparse
> from urlparse import urljoin
> from scrapy.http import Request
>
> class MySpider(CrawlSpider):
> name = 'testes2'
> allowed_domains = ['hoteis.pt']
> start_urls = [
> 'http://www.hoteis.pt/pesquisa/filtro/?tipo=0&local=0'
> ]
>
> rules =
> (Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]/@href')),
> callback='parsePage',follow=True),)
>
> def parsePage(self, response):
> sel = Selector(response)
> urls = sel.xpath('//div[@id="btReserve"]/../@href').extract()
> for url in urls:
> url = urljoin(response.url, url)
> self.log('URLS: %s' % url)
> yield Request(url, callback = self.parseLinks)
>
> def parseLinks(self, response):
> sel = Selector(response)
> titulo = sel.xpath('h1/text()').extract()
> morada = sel.xpath('//div[@class="MORADA"]/text()').extract()
> email = sel.xpath('//a[@class="sendMail"][1]/text()')[0].extract()
> url = sel.xpath('//div[@class="contentContacto
> sendUrl"]/a/text()').extract()
> telefone =
> sel.xpath('//div[@class="telefone"]/div[@class="contentContacto"]/text()').extract()
> fax =
> sel.xpath('//div[@class="fax"]/div[@class="contentContacto"]/text()').extract()
> descricao =
> sel.xpath('//div[@id="tbDescricao"]/p/text()').extract()
> gps = sel.xpath('//td[@class="sendGps"]/@style').extract()
>
> print titulo, email, morada
>
> On Monday, January 13, 2014 4:56:11 PM UTC, Paul Tremberth wrote:
>>
>> Hi,
>> I just replied to your StackOverflow question also.
>> One problem is that you should not override CrawlSpider's parse method,
>> otherwise the default behaviour following rules and everything will not
>> happen.
>>
>> /Paul.
>>
>> On Monday, January 13, 2014 12:13:24 PM UTC+1, ajrpc wrote:
>>>
>>> Hello,
>>>
>>> I have set Rules to get the next pages from the start_url, but it's not
>>> working, it only crawls the start_urls page, and the links in that page
>>> (with parseLinks). It doesn't go to the next page set in Rules.
>>>
>>> any help ?
>>>
>>> from scrapy.contrib.spiders import CrawlSpider, Rule
>>> from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
>>> from scrapy.selector import Selector
>>> from scrapy import log
>>> from urlparse import urlparse
>>> from urlparse import urljoin
>>> from scrapy.http import Request
>>>
>>> class MySpider(CrawlSpider):
>>> name = 'testes2'
>>> allowed_domains = ['hoteis.pt']
>>> start_urls = [
>>> 'http://www.hoteis.pt/pesquisa/filtro/?tipo=0&local=0'
>>> ]
>>>
>>> rules =
>>> (Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]/@href')),
>>> follow=True),)
>>>
>>> def parse(self, response):
>>> sel = Selector(response)
>>> urls = sel.xpath('//div[@id="btReserve"]/../@href').extract()
>>> for url in urls:
>>> url = urljoin(response.url, url)
>>> self.log('URLS: %s' % url)
>>> yield Request(url, callback = self.parseLinks)
>>>
>>> def parseLinks(self, response):
>>> sel = Selector(response)
>>> titulo = sel.xpath('h1/text()').extract()
>>> morada = sel.xpath('//div[@class="MORADA"]/text()').extract()
>>> email =
>>> sel.xpath('//a[@class="sendMail"][1]/text()')[0].extract()
>>> url = sel.xpath('//div[@class="contentContacto
>>> sendUrl"]/a/text()').extract()
>>> telefone =
>>> sel.xpath('//div[@class="telefone"]/div[@class="contentContacto"]/text()').extract()
>>> fax =
>>> sel.xpath('//div[@class="fax"]/div[@class="contentContacto"]/text()').extract()
>>> descricao =
>>> sel.xpath('//div[@id="tbDescricao"]/p/text()').extract()
>>> gps = sel.xpath('//td[@class="sendGps"]/@style').extract()
>>>
>>> print titulo, email, morada
>>>
>>
--
You received this message because you are subscribed to the Google Groups
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/groups/opt_out.