Hi, you need to define the next request: you only set start_urls, and you never set the next URL for Scrapy to follow.

Check page 4 of this PDF:

https://media.readthedocs.org/pdf/scrapy/1.0/scrapy.pdf
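Roughly, the pattern looks like the sketch below (the selectors are copied from your code and the empty dicts are placeholders for your HandlingStoreInfo items, not verified against the page). The key point is that return exits parse() after the first <li>, which is why your loop stops, while yield hands each request or item to Scrapy and keeps iterating:

import scrapy

class ShijukuHandlingStores(scrapy.Spider):
    name = "singlepagestores"
    start_urls = ['https://suumo.jp/chintai/tokyo/sc_shinjuku/jnc_000013357603/']

    def parse(self, response):
        for li in response.xpath('/html/body/div[5]/div[6]/ul//li'):
            hsurl = li.xpath(
                './/div[@class="itemcassette_img-desc"]/a/@href').extract_first()
            if hsurl:
                # Define the next request and yield it; yield (unlike return)
                # lets the for loop continue with the remaining <li> tags.
                request = scrapy.Request(response.urljoin(hsurl),
                                         callback=self.parse_storeinfo,
                                         dont_filter=True)
                request.meta['item'] = {}  # placeholder: your HandlingStoreInfo()
                yield request
            else:
                # No detail link: build and emit the item from this <li>.
                yield {}  # placeholder: your populated HandlingStoreInfo()

    def parse_storeinfo(self, response):
        item = response.meta['item']
        # ... fill the store fields from the detail page ...
        yield item

With yield, parse() becomes a generator, so Scrapy keeps pulling requests and items from it instead of stopping at the first return; the same applies to the return item in your else branch.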

Regards

On Monday, January 30, 2017 at 14:37:51 (UTC-5), bassam...@gmail.com wrote:
>
> Here is the Scrapy code I am using. The for loop needs to go through all 
> <li> tags and check a specific XPath for the presence of the anchor tag, 
> but the loop stops at the first iteration.
>
> import scrapy
> from shinjukuproject.items import HandlingStoreInfo
> from w3lib.html import remove_tags
>
> class ShijukuHandlingStores(scrapy.Spider):
>     name = "singlepagestores"
>     start_urls = ['https://suumo.jp/chintai/tokyo/sc_shinjuku/jnc_000013357603/']
> 
>     def parse(self, response):
>         for li in response.xpath('/html/body/div[5]/div[6]/ul//li'):
>             hsurl = li.xpath(
>                 './/div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-object"]/div[@class="itemcassette_img"]/div[@class="itemcassette_img-desc"]/a/@href'
>             ).extract_first()
>             if hsurl:
>                 item = HandlingStoreInfo()
>                 item['Room_ID'] = response.xpath(
>                     '/html/head/link[@rel="canonical"]/@href').re('\w+\_\d+')
>                 request = scrapy.Request(response.urljoin(hsurl),
>                                          callback=self.parse_storeinfo, dont_filter=True)
>                 request.meta['item'] = item
>                 return request
>             else:
>                 item = HandlingStoreInfo()
>                 item['Room_ID'] = response.xpath(
>                     '/html/head/link[@rel="canonical"]/@href').re('\w+\_\d+')
>                 hsn = li.xpath(
>                     '//div[@class="itemcassette"]/div[@class="itemcassette-header"]/span[@class="itemcassette-header-ttl"]/text()'
>                 ).extract_first('Null').strip()
>                 item['Handling_Store_Name'] = remove_tags(hsn)
>                 item['Handling_Store_id'] = item['Room_ID']
>                 item['Location'] = li.xpath(
>                     '//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell01"]/text()'
>                 ).extract_first('Null').strip()
>                 item['Transportation_Facilities'] = "N/A"
>                 contact = li.xpath(
>                     '//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell04"]/span/text()'
>                 ).re('\d+\-\d+\-\d+')
>                 item['Contact'] = remove_tags(contact)
>                 item['Fax'] = "N/A"
>                 bh = li.xpath(
>                     '//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell02"]/text()'
>                 ).extract_first(' ').strip()
>                 item['Buisiness_Hours'] = remove_tags(bh)
>                 rh = li.xpath(
>                     '//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell03"]/text()'
>                 ).extract_first(' ').strip()
>                 item['Regular_Holidays'] = remove_tags(rh)
>                 item['License_Number'] = "N/A"
>                 item['Store_Characteristics'] = "N/A"
>                 return item
> 
>     def parse_storeinfo(self, response):
>         item = response.meta['item']
>         item['Handling_Store_Name'] = response.css(
>             'html body.chintai.ch_leaf div#wrapper div#contents.ch-shdt h1::text'
>         ).extract_first(' ').strip()
>         item['Handling_Store_id'] = response.css(
>             'html head link[rel=canonical]::attr(href)').re('\w+\_\d+\_\d+')
>         item['Location'] = response.css(
>             'div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou tr:nth-of-type(1) td:nth-of-type(1)::text'
>         ).extract_first(' ').strip()
>         item['Transportation_Facilities'] = response.css(
>             'div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou tr:nth-of-type(1) td:nth-of-type(2) ul li::text'
>         ).extract_first()
>         item['Contact'] = response.css(
>             'html body.chintai.ch_leaf div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou tr:nth-of-type(2) td:nth-of-type(1) span.col-notice em::text'
>         ).extract_first(' ').strip()
>         item['Fax'] = response.css(
>             'html body.chintai.ch_leaf div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou tr:nth-of-type(2) td:nth-of-type(2)::text'
>         ).re('\d+\-\d+\-\d+')
>         bh = response.css(
>             'html body.chintai.ch_leaf div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou tr:nth-of-type(3) td:nth-of-type(1)::text'
>         ).extract_first(' ').strip()
>         item['Buisiness_Hours'] = remove_tags(bh)
>         rh = response.css(
>             'html body.chintai.ch_leaf div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou tr:nth-of-type(3) td:nth-of-type(2)::text'
>         ).extract_first(' ').strip()
>         item['Regular_Holidays'] = remove_tags(rh)
>         item['License_Number'] = response.css(
>             'html body.chintai.ch_leaf div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou tr:nth-of-type(4) td:nth-of-type(2)::text'
>         ).extract_first(' ').strip()
>         item['Store_Characteristics'] = response.css(
>             'html body.chintai.ch_leaf div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou tr:nth-of-type(5) td:nth-of-type(1)::text'
>         ).extract_first(' ').strip()
>         return item
