> > item.py > class RuelalaDealItem(Item): > deal_url = Field() > category = Field() > products = Field() > class RuelalaProductItem(Item): > product_name = Field() > regular_price = Field() > sales_price = Field() > category = Field() > url = Field()
crawler.py import json import time > from scrapy import signals > from scrapy.http.request import Request > from scrapy.spider import Spider > from scrapy.xlib.pydispatch import dispatcher > from crawler.items import RuelalaProductItem, RuelalaDealItem > class SearchSpider(Spider): > name = 'ruelala_search_test' > allowed_domains = ["ruelala.com"] > count = 0 > > start_urls = [ > 'https://www.ruelala.com/', > ] > def __init__(self, *args, **kwargs): > try : > super(SearchSpider, self).__init__(*args, **kwargs) > dispatcher.connect(self.spider_opened, signals.spider_opened) > dispatcher.connect(self.spider_closed, signals.spider_closed) > except : > pass > def parse(self, response): > try : > urls = [] > urls.append('https://www.ruelala.com/boutique/80765/') > urls.append('https://www.ruelala.com/boutique/73831/') > urls.append('https://www.ruelala.com/boutique/80764/') > urls.append('https://www.ruelala.com/boutique/79710/') > item = RuelalaDealItem() > for url in urls: > item['category'] = 'women' > item['deal_url'] = url > yield Request(url, callback=self.product_list_test, > meta=dict(item=item)) > except : > pass > def product_list_test(self, response): > try : > item = response.meta['item'] > data = response.css('body > script:nth-child(13)').extract() > for next in data: > if 'productListData' in next: > start = next.index('productListData:') > end = next.index('boutiqueName:') > str_json = next[start:end].strip() > str_json = str_json[21:-1] > products = json.loads(str_json) > pro_list = [] > for product in products: > url = '%s%s' % ('https://www.ruelala.com', > product['linkDetail']) > request = Request(url, > callback=self.product_detail_test, meta=dict(item=item)) > pro_list.append(request) > print dir (request) > print request.body > item['products'] = pro_list > yield item > except : > pass > def product_detail_test(self, response): > try : > item = response.meta['item'] > product = RuelalaProductItem() > product['category'] = item['category'] > product['product_name'] = > response.xpath('//h1[@class="product-detail-name"]/text()').extract()[0] > try : > product['regular_price'] = > response.xpath('//article[@class="product-detail > non-exp-product-detail"]/aside/p/del/text()').extract()[0] > except IndexError: > product['regular_price'] = -1 > product['sales_price'] = > response.xpath('//article[@class="product-detail > non-exp-product-detail"]/aside/p/ins/text()').extract()[0] > product['url'] = response.url > self.count += 1 > yield product > except : > pass > def spider_opened(self, spider): > try : > self.start_time = time.time() > except : > pass > def spider_closed(self, spider): > try : > interval = time.time() - self.start_time > self.log('\ntotal time:%f total count:%d one:%f' % (interval, > self.count, interval/self.count)) > except : > pass start scrapy "scrapy crawl ruelala_search_test -o item.json -t json" output sample file > {'category': 'women', > 'deal_url': 'https://www.ruelala.com/boutique/79710/', > 'products': [<GET > https://www.ruelala.com/boutique/product/73831/11223255/>, > <GET > https://www.ruelala.com/boutique/product/73831/11223254/>, > <GET > https://www.ruelala.com/boutique/product/73831/12905415/>, > ... > ... > <GET > https://www.ruelala.com/boutique/product/73831/11223269/>, > <GET > https://www.ruelala.com/boutique/product/73831/12883524/>, > <GET > https://www.ruelala.com/boutique/product/73831/11223249/>]} but i want output sample > {"category": "women", }, > {'category': 'women', > 'deal_url': 'https://www.ruelala.com/boutique/79710/', > 'products': [ > {"url": > "https://www.ruelala.com/boutique/product/79710/12887391/", "product_name": > "Giorgio Armani Light Pink Tank", "sales_price": "$79.99", "regular_price": > "$395.00"}, > {"url": > "https://www.ruelala.com/boutique/product/$OTHER_URL1/", "product_name": > "$OTHER_NAME1", "sales_price": "$19.99", "regular_price": "$395.00"}, > {"url": > "https://www.ruelala.com/boutique/product/$OTHER_URL2/", "product_name": > "$OTHER_NAME2", "sales_price": "$29.00", "regular_price": "$395.00"}, > {"url": > "https://www.ruelala.com/boutique/product/$OTHER_URL3/", "product_name": > "$OTHER_NAME3", "sales_price": "$9.99", "regular_price": "$25.00"}, > {"url": > "https://www.ruelala.com/boutique/product/$OTHER_URL4/", "product_name": > "$OTHER_NAME4", "sales_price": "$5.99", "regular_price": "$95.00"}, > ]} -- You received this message because you are subscribed to the Google Groups "scrapy-users" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. To post to this group, send email to [email protected]. Visit this group at http://groups.google.com/group/scrapy-users. For more options, visit https://groups.google.com/d/optout.
