Scrapy not going through the for loop

bassamfarooq Mon, 30 Jan 2017 11:38:08 -0800

Here is the scrapy code i am using. The for loop needs to go through all 
<li> tags. And check that specific xpath for the presence of the anchor 
tag. But the loop stops at the first iteration of the for loop.


import scrapy
from shinjukuproject.items import HandlingStoreInfo
from w3lib.html import remove_tags

class ShijukuHandlingStores(scrapy.Spider):
    name = "singlepagestores"
    start_urls = [
'https://suumo.jp/chintai/tokyo/sc_shinjuku/jnc_000013357603/']
    
    
    
            
            
    def parse(self, response):
        
        for li in response.xpath('/html/body/div[5]/div[6]/ul//li'):
        
            
            hsurl = li.xpath(
'.//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-object"]/div[@class="itemcassette_img"]/div[@class="itemcassette_img-desc"]/a/@href'
).extract_first()
            
            if(hsurl):
                item = HandlingStoreInfo()
                item['Room_ID'] = response.xpath(
'/html/head/link[@rel="canonical"]/@href').re('\w+\_\d+')
                request = scrapy.Request(response.urljoin(hsurl), callback=
self.parse_storeinfo, dont_filter = True)
                request.meta['item'] = item
                return request
            else:
                item = HandlingStoreInfo()
                item['Room_ID'] = response.xpath(
'/html/head/link[@rel="canonical"]/@href').re('\w+\_\d+')
                hsn = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-header"]/span[@class="itemcassette-header-ttl"]/text()'
).extract_first('Null').strip()
                item['Handling_Store_Name'] = remove_tags(hsn)
                item['Handling_Store_id'] = item['Room_ID']
                item['Location'] = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell01"]/text()'
).extract_first('Null').strip()
                item['Transportation_Facilities'] = "N/A"
                contact = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell04"]/span/text()'
).re('\d+\-\d+\-\d+')
                item['Contact'] = remove_tags(contact)
                item['Fax'] = "N/A"
                bh = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell02"]/text()'
).extract_first(' ').strip()
                item['Buisiness_Hours'] = remove_tags(bh)
                rh = li.xpath(
'//div[@class="itemcassette"]/div[@class="itemcassette-body"]/div[@class="itemcassette-body-contents"]/div[@class="itemcassette_matrix"]/div[@class="itemcassette_matrix-cell03"]/text()'
).extract_first(' ').strip()
                item['Regular_Holidays'] = remove_tags(rh)
                item['License_Number'] = "N/A"
                item['Store_Characteristics'] = "N/A"
                return item        
                
                
    def parse_storeinfo(self, response):
        
        
        item = response.meta['item']
        item['Handling_Store_Name'] = response.css('html 
body.chintai.ch_leaf div#wrapper div#contents.ch-shdt h1::text').
extract_first(' ').strip()
        item['Handling_Store_id'] = response.css('html head 
link[rel=canonical]::attr(href)').re('\w+\_\d+\_\d+')
        item['Location'] = response.css('div#wrapper div#contents.ch-shdt 
div.section table.data_table.table_gaiyou tr:nth-of-type(1) 
td:nth-of-type(1)::text').extract_first(' ').strip()
        item['Transportation_Facilities'] = response.css('div#wrapper 
div#contents.ch-shdt div.section table.data_table.table_gaiyou 
tr:nth-of-type(1) td:nth-of-type(2) ul li::text').extract_first()
        item['Contact'] = response.css('html body.chintai.ch_leaf 
div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou 
tr:nth-of-type(2) td:nth-of-type(1) span.col-notice em::text').extract_first
(' ').strip()
        item['Fax'] = response.css('html body.chintai.ch_leaf div#wrapper 
div#contents.ch-shdt div.section table.data_table.table_gaiyou 
tr:nth-of-type(2) td:nth-of-type(2)::text').re('\d+\-\d+\-\d+')
        bh = response.css('html body.chintai.ch_leaf div#wrapper 
div#contents.ch-shdt div.section table.data_table.table_gaiyou 
tr:nth-of-type(3) td:nth-of-type(1)::text').extract_first(' ').strip()
        item['Buisiness_Hours'] = remove_tags(bh)
        rh = response.css('html body.chintai.ch_leaf div#wrapper 
div#contents.ch-shdt div.section table.data_table.table_gaiyou 
tr:nth-of-type(3) td:nth-of-type(2)::text').extract_first(' ').strip()
        item['Regular_Holidays'] = remove_tags(rh)
        item['License_Number'] = response.css('html body.chintai.ch_leaf 
div#wrapper div#contents.ch-shdt div.section table.data_table.table_gaiyou 
tr:nth-of-type(4) td:nth-of-type(2)::text').extract_first(' ').strip()
        item['Store_Characteristics'] = response.css('html 
body.chintai.ch_leaf div#wrapper div#contents.ch-shdt div.section 
table.data_table.table_gaiyou tr:nth-of-type(5) td:nth-of-type(1)::text').
extract_first(' ').strip()
        return item            




-- 
You received this message because you are subscribed to the Google Groups 
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to scrapy-users+unsubscr...@googlegroups.com.
To post to this group, send email to scrapy-users@googlegroups.com.
Visit this group at https://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.

Scrapy not going through the for loop

Reply via email to