restrict_xpaths should select the area of the page from which you want links
to be extracted.
Try using

rules = (
    Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]',)),
         callback='parsePage', follow=True),
)

instead — that is, without the "/@href" attribute step in the XPath.
restrict_xpaths must point at elements (or a region containing links), not at
attributes; the link extractor pulls the href itself.
On Monday, January 13, 2014 6:12:50 PM UTC+1, ajrpc wrote:
>
> Thank you very much Paul.
>
> I've changed the Rule callback to 'parsePage' and renamed def parse to def
> parsePage, and now it doesn't enter parsePage() — it does nothing. Now
> the code looks like:
>
> from scrapy.contrib.spiders import CrawlSpider, Rule
> from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
> from scrapy.selector import Selector
> from scrapy import log
> from urlparse import urlparse
> from urlparse import urljoin
> from scrapy.http import Request
>
> class MySpider(CrawlSpider):
> name = 'testes2'
> allowed_domains = ['hoteis.pt']
> start_urls = [
> 'http://www.hoteis.pt/pesquisa/filtro/?tipo=0&local=0'
> ]
>
> rules =
> (Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]/@href')),
> callback='parsePage',follow=True),)
>
> def parsePage(self, response):
> sel = Selector(response)
> urls = sel.xpath('//div[@id="btReserve"]/../@href').extract()
> for url in urls:
> url = urljoin(response.url, url)
> self.log('URLS: %s' % url)
> yield Request(url, callback = self.parseLinks)
>
> def parseLinks(self, response):
> sel = Selector(response)
> titulo = sel.xpath('h1/text()').extract()
> morada = sel.xpath('//div[@class="MORADA"]/text()').extract()
> email = sel.xpath('//a[@class="sendMail"][1]/text()')[0].extract()
> url = sel.xpath('//div[@class="contentContacto
> sendUrl"]/a/text()').extract()
> telefone =
> sel.xpath('//div[@class="telefone"]/div[@class="contentContacto"]/text()').extract()
> fax =
> sel.xpath('//div[@class="fax"]/div[@class="contentContacto"]/text()').extract()
> descricao =
> sel.xpath('//div[@id="tbDescricao"]/p/text()').extract()
> gps = sel.xpath('//td[@class="sendGps"]/@style').extract()
>
> print titulo, email, morada
>
> On Monday, January 13, 2014 4:56:11 PM UTC, Paul Tremberth wrote:
>>
>> Hi,
>> I just replied to your StackOverflow question also.
>> One problem is that you should not override CrawlSpider's parse method,
>> otherwise the default behaviour following rules and everything will not
>> happen.
>>
>> /Paul.
>>
>> On Monday, January 13, 2014 12:13:24 PM UTC+1, ajrpc wrote:
>>>
>>> Hello,
>>>
>>> I have set Rules to get the next pages from the start_url, but it's not
>>> working, it only crawls the start_urls page, and the links in that page
>>> (with parseLinks). It doesn't go to the next page set in Rules.
>>>
>>> any help ?
>>>
>>> from scrapy.contrib.spiders import CrawlSpider, Rule
>>> from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
>>> from scrapy.selector import Selector
>>> from scrapy import log
>>> from urlparse import urlparse
>>> from urlparse import urljoin
>>> from scrapy.http import Request
>>>
>>> class MySpider(CrawlSpider):
>>> name = 'testes2'
>>> allowed_domains = ['hoteis.pt']
>>> start_urls = [
>>> 'http://www.hoteis.pt/pesquisa/filtro/?tipo=0&local=0'
>>> ]
>>>
>>> rules =
>>> (Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]/@href')),
>>> follow=True),)
>>>
>>> def parse(self, response):
>>> sel = Selector(response)
>>> urls = sel.xpath('//div[@id="btReserve"]/../@href').extract()
>>> for url in urls:
>>> url = urljoin(response.url, url)
>>> self.log('URLS: %s' % url)
>>> yield Request(url, callback = self.parseLinks)
>>>
>>> def parseLinks(self, response):
>>> sel = Selector(response)
>>> titulo = sel.xpath('h1/text()').extract()
>>> morada = sel.xpath('//div[@class="MORADA"]/text()').extract()
>>> email =
>>> sel.xpath('//a[@class="sendMail"][1]/text()')[0].extract()
>>> url = sel.xpath('//div[@class="contentContacto
>>> sendUrl"]/a/text()').extract()
>>> telefone =
>>> sel.xpath('//div[@class="telefone"]/div[@class="contentContacto"]/text()').extract()
>>> fax =
>>> sel.xpath('//div[@class="fax"]/div[@class="contentContacto"]/text()').extract()
>>> descricao =
>>> sel.xpath('//div[@id="tbDescricao"]/p/text()').extract()
>>> gps = sel.xpath('//td[@class="sendGps"]/@style').extract()
>>>
>>> print titulo, email, morada
>>>
>>
--
You received this message because you are subscribed to the Google Groups
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/groups/opt_out.