items.py
 

from scrapy.item import Item, Field


class RuelalaDealItem(Item):
    deal_url = Field()
    category = Field()
    products = Field()


class RuelalaProductItem(Item):
    product_name = Field()
    regular_price = Field()
    sales_price = Field()
    category = Field()
    url = Field()


crawler.py 

 

import json
import time

from scrapy import signals
from scrapy.http.request import Request
from scrapy.spider import Spider
from scrapy.xlib.pydispatch import dispatcher

from crawler.items import RuelalaProductItem, RuelalaDealItem


class SearchSpider(Spider):
    name = 'ruelala_search_test'
    allowed_domains = ["ruelala.com"]
    count = 0

    start_urls = [
        'https://www.ruelala.com/',
    ]

    def __init__(self, *args, **kwargs):
        try:
            super(SearchSpider, self).__init__(*args, **kwargs)
            # connect the timing hooks to the open/close signals
            dispatcher.connect(self.spider_opened, signals.spider_opened)
            dispatcher.connect(self.spider_closed, signals.spider_closed)
        except:
            pass

    def parse(self, response):
        try:
            urls = [
                'https://www.ruelala.com/boutique/80765/',
                'https://www.ruelala.com/boutique/73831/',
                'https://www.ruelala.com/boutique/80764/',
                'https://www.ruelala.com/boutique/79710/',
            ]
            for url in urls:
                # build a fresh item per boutique; a single shared item
                # would be mutated by every loop iteration
                item = RuelalaDealItem()
                item['category'] = 'women'
                item['deal_url'] = url
                yield Request(url, callback=self.product_list_test,
                              meta=dict(item=item))
        except:
            pass

    def product_list_test(self, response):
        try:
            item = response.meta['item']
            data = response.css('body > script:nth-child(13)').extract()
            for script in data:
                if 'productListData' in script:
                    # slice the embedded JSON array out of the inline script
                    start = script.index('productListData:')
                    end = script.index('boutiqueName:')
                    str_json = script[start:end].strip()
                    str_json = str_json[21:-1]
                    products = json.loads(str_json)
                    pro_list = []
                    for product in products:
                        url = '%s%s' % ('https://www.ruelala.com',
                                        product['linkDetail'])
                        request = Request(url, callback=self.product_detail_test,
                                          meta=dict(item=item))
                        pro_list.append(request)
                    item['products'] = pro_list
                    yield item
        except:
            pass

    def product_detail_test(self, response):
        try:
            item = response.meta['item']
            product = RuelalaProductItem()
            product['category'] = item['category']
            product['product_name'] = response.xpath(
                '//h1[@class="product-detail-name"]/text()').extract()[0]
            try:
                product['regular_price'] = response.xpath(
                    '//article[@class="product-detail non-exp-product-detail"]'
                    '/aside/p/del/text()').extract()[0]
            except IndexError:
                # no strike-through price on the page
                product['regular_price'] = -1
            product['sales_price'] = response.xpath(
                '//article[@class="product-detail non-exp-product-detail"]'
                '/aside/p/ins/text()').extract()[0]
            product['url'] = response.url
            self.count += 1
            yield product
        except:
            pass

    def spider_opened(self, spider):
        try:
            self.start_time = time.time()
        except:
            pass

    def spider_closed(self, spider):
        try:
            interval = time.time() - self.start_time
            self.log('\ntotal time:%f total count:%d one:%f'
                     % (interval, self.count, interval / self.count))
        except:
            pass
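(Aside: the hard-coded index slicing in product_list_test is brittle. The same embedded JSON could be pulled out with a regex; a minimal, untested sketch, assuming the inline script keeps the "productListData: [...] , boutiqueName:" layout seen above. extract_product_list is a hypothetical helper, not part of the spider:)

import json
import re

# matches the JSON array between 'productListData:' and ', boutiqueName:'
PRODUCT_LIST_RE = re.compile(
    r'productListData:\s*(\[.*?\])\s*,\s*boutiqueName:', re.DOTALL)

def extract_product_list(script_text):
    match = PRODUCT_LIST_RE.search(script_text)
    if match is None:
        return []  # script did not contain the expected block
    return json.loads(match.group(1))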


I start the crawl with "scrapy crawl ruelala_search_test -o item.json -t json".
Sample from the output file:

{'category': 'women',
 'deal_url': 'https://www.ruelala.com/boutique/79710/',
 'products': [<GET https://www.ruelala.com/boutique/product/73831/11223255/>,
              <GET https://www.ruelala.com/boutique/product/73831/11223254/>,
              <GET https://www.ruelala.com/boutique/product/73831/12905415/>,
              ...
              ...
              <GET https://www.ruelala.com/boutique/product/73831/11223269/>,
              <GET https://www.ruelala.com/boutique/product/73831/12883524/>,
              <GET https://www.ruelala.com/boutique/product/73831/11223249/>]}


But the products list holds the Request objects themselves, not the scraped
product data. What I want is output like this:

> {"category": "women", },
> {'category': 'women',
>  'deal_url': 'https://www.ruelala.com/boutique/79710/',
>  'products': [
>               {"url": 
> "https://www.ruelala.com/boutique/product/79710/12887391/";, "product_name": 
> "Giorgio Armani Light Pink Tank", "sales_price": "$79.99", "regular_price": 
> "$395.00"},
>               {"url": 
> "https://www.ruelala.com/boutique/product/$OTHER_URL1/";, "product_name": 
> "$OTHER_NAME1", "sales_price": "$19.99", "regular_price": "$395.00"},
>               {"url": 
> "https://www.ruelala.com/boutique/product/$OTHER_URL2/";, "product_name": 
> "$OTHER_NAME2", "sales_price": "$29.00", "regular_price": "$395.00"},
>               {"url": 
> "https://www.ruelala.com/boutique/product/$OTHER_URL3/";, "product_name": 
> "$OTHER_NAME3", "sales_price": "$9.99", "regular_price": "$25.00"},
>               {"url": 
> "https://www.ruelala.com/boutique/product/$OTHER_URL4/";, "product_name": 
> "$OTHER_NAME4", "sales_price": "$5.99", "regular_price": "$95.00"},
> ]}
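From what I've read, one common Scrapy pattern for this kind of nesting is to chain the product-detail requests through meta and yield the deal item only after the last detail page has been parsed, so that 'products' ends up holding plain dicts instead of Request objects. A minimal, untested sketch against the spider above (price extraction omitted for brevity):

    def product_list_test(self, response):
        item = response.meta['item']
        item['products'] = []
        urls = []
        for script in response.css('body > script:nth-child(13)').extract():
            if 'productListData' in script:
                start = script.index('productListData:')
                end = script.index('boutiqueName:')
                products = json.loads(script[start:end].strip()[21:-1])
                urls = ['https://www.ruelala.com%s' % p['linkDetail']
                        for p in products]
        if urls:
            # fetch the first detail page; the remaining URLs ride in meta
            yield Request(urls[0], callback=self.product_detail_test,
                          meta={'item': item, 'pending': urls[1:]})

    def product_detail_test(self, response):
        item = response.meta['item']
        item['products'].append({
            'url': response.url,
            'product_name': response.xpath(
                '//h1[@class="product-detail-name"]/text()').extract()[0],
            # sales_price / regular_price would be extracted as above
        })
        pending = response.meta['pending']
        if pending:
            # still more product pages to visit: chain the next request
            yield Request(pending[0], callback=self.product_detail_test,
                          meta={'item': item, 'pending': pending[1:]})
        else:
            # last product parsed, the deal item is complete: export it
            yield item

Is this the right approach, or is there a better way to get nested product data into the exported items?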

