Hi all, I have spider code with which I want to scrape data from all pages. But the data is only being extracted from the second page onward; the first page's data is skipped. Please help me solve this problem. My spider code is:
import re
import sys
from string import join
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from ebay.items import EbayItem
from urlparse import urlparse


class InfojobsSpider(CrawlSpider):
    name = "ebaytest"
    allowed_domains = ["www.ebay.com"]
    start_urls = [
        "http://www.ebay.com/sch/m.html?_ssn=lugnutguys&_ipg=200&rt=nc"
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'sch\/m\.html'),
                               restrict_xpaths=('//a[@class="gspr next"]')),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        items = []
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//table[contains(@class,"li rsittlref")]//tr')
        for titleurl in titles:
            item = EbayItem()
            item['titlename'] = titleurl.select('td[@class="dtl dtlsp"]//h3/a/text()').extract()
            item['titleurl'] = titleurl.select('td[@class="dtl dtlsp"]//h3/a/@href').extract()
            testdata = item['titleurl']
            for data2 in testdata:
                preg = re.search('.+\/(.*?)\?', data2)
                if preg:
                    item['upc'] = preg.group(1)
                else:
                    print "not found"
            item['price'] = titleurl.select('td[@class="prc"]//div[@class="g-b"]/span/text()').extract()
            items.append(item)
        return items

    parse_start_urls = parse_item
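For reference: a CrawlSpider Rule callback only fires for links the rule extracts (here the "gspr next" pagination links), never for the responses to start_urls themselves. Those responses go through parse_start_url (singular), which returns nothing by default, so the first page's items are dropped. A minimal sketch of that idea, trimmed to the relevant lines (the extraction body would stay exactly as in the code above):

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class InfojobsSpider(CrawlSpider):
    name = "ebaytest"
    allowed_domains = ["www.ebay.com"]
    start_urls = [
        "http://www.ebay.com/sch/m.html?_ssn=lugnutguys&_ipg=200&rt=nc"
    ]
    rules = (
        # The callback runs only for links matched by this rule, i.e. the
        # "next" pagination links -- never for the start URL itself.
        Rule(SgmlLinkExtractor(allow=(r'sch\/m\.html'),
                               restrict_xpaths=('//a[@class="gspr next"]')),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # ... same extraction logic as in the code above ...
        return []

    # CrawlSpider calls parse_start_url() for each start_urls response;
    # aliasing it to parse_item makes page 1 go through the same parsing.
    # Note the singular name: parse_start_url, not parse_start_urls.
    parse_start_url = parse_item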