This is my code:
# -*- coding:utf-8 -*-
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request, Response
from scrapy.http import FormRequest
from edu.items import EduItem
import MySQLdb
import time

class XdfSpider(CrawlSpider):
    name = 'xdf'
    allowed_domains = ['xdf.cn']
    start_urls = ['http://souke.xdf.cn/Category/1.html']

    def __init__(self):
        self.page = []
        self.db = MySQLdb.connect(host='###', user='###',
                                  passwd='###', db='###', charset='###')
        CrawlSpider.__init__(self)

    def start_requests(self):
        # Attach the start URL to each initial request as meta['one']
        reqs = []
        for item in self.start_urls:
            reqs.append(Request(item, meta={'one': item}))
        return reqs

    xpath1 = '//dl[@class="course"]/dt/a'
    r1 = r'1-324-0-0.html?v=5'
    xpath2 = '//h2/a'
    r2 = r'1-6753.html?v=5'

    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths=xpath1, unique=True),
             process_request='checkurl_1', follow=True),
        Rule(SgmlLinkExtractor(restrict_xpaths=xpath2, unique=True),
             callback='parse_item', follow=False),
    )

    def checkurl_1(self, request):
        # Stash the URL matched by the first rule as meta['two']
        request.meta['two'] = request.url
        return request

    def getcont(self, tab):
        # First extracted value, stripped and UTF-8 encoded, or '' if empty
        if not tab:
            return ''
        return tab[0].strip().encode('utf8')

    def getconts(self, tab):
        # All extracted values joined with '~', or '' if empty
        if not tab:
            return ''
        return '~'.join(item.strip().encode('utf8') for item in tab)

    # Current timestamp
    def now(self):
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

    # Insert a record into the database (parameterized to avoid quoting bugs)
    def intodb(self, database, tab):
        sqlString = ('insert into edu(title,number,etime,eplace,ecount,'
                     'eprice,estatus,url_1,url_2,url_3,create_time,area_code,'
                     'area_name,area_add) values(%s,%s,%s,%s,%s,%s,'
                     '%s,%s,%s,%s,%s,%s,%s,%s)')
        cursor = database.cursor()
        cursor.execute(sqlString, tuple(tab))
        database.commit()

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        # Course name
        x0 = '//div[@class="box2"]//h2/text()'
        title = hxs.select(x0).extract()
        title = self.getcont(title)
        x1 = '//div[@id="course_class_list"]//div[@class="tabCont"]//ul'
        length = len(hxs.select(x1))
        for item in range(length):
            .......................
        print response.meta

I want to store two URLs: the first is the start_url and the second is the
request URL matched by the first rule, so I passed them along in meta. But it
does not work: when I print response.meta in parse_item, there is no
meta['one'] and no meta['two']. Why?
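
For comparison, here is a minimal sketch (a hypothetical spider, not the code
above; the URLs are placeholders) of how meta normally travels in Scrapy: a
meta dict only reaches the callback of the request it was set on, so every new
Request created along the way has to copy it forward explicitly:

from scrapy.http import Request
from scrapy.spider import BaseSpider

class MetaDemoSpider(BaseSpider):
    name = 'metademo'
    start_urls = ['http://example.com/']  # placeholder

    def start_requests(self):
        for url in self.start_urls:
            # meta set here is visible only in parse_list's response
            yield Request(url, meta={'one': url}, callback=self.parse_list)

    def parse_list(self, response):
        for href in ['http://example.com/item1']:  # placeholder links
            meta = dict(response.meta)  # copy the inherited meta forward...
            meta['two'] = href          # ...and add the second URL
            yield Request(href, meta=meta, callback=self.parse_item)

    def parse_item(self, response):
        # Both keys survive because each hop re-attached them explicitly
        print response.meta.get('one'), response.meta.get('two')

As far as I can tell, the requests that a CrawlSpider's rules extract are
brand-new Request objects that do not inherit the meta of the response they
were extracted from, which would explain why meta['one'] never reaches
parse_item.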