I have the following code, which scrapes the data correctly:

class DmozSpider(Spider):
    """Scrape the Reuters daily news archive pages for a fixed date range."""

    name = "dmoz"
    allowed_domains = ["reuters.com"]

    # Date range to crawl (inclusive).  Leading-zero integer literals such
    # as 04 are a SyntaxError in Python 3 (they were octal in Python 2),
    # so the month/day must be written as plain 4 and 1.
    a = datetime.date(2014, 4, 1)
    b = datetime.date(2014, 4, 2)

    # Accumulator for the XML output tree.
    articles = ET.Element("articles")

    # One archive URL per day, e.g. .../resources/archive/us/20140401.html
    # (The stray ';' after the string literal in the original was a
    # copy/paste artifact and a SyntaxError.)
    urls = [
        "http://www.reuters.com/resources/archive/us/"
        + dt.strftime("%Y%m%d") + ".html"
        for dt in rrule(DAILY, dtstart=a, until=b)
    ]

    def start_requests(self):
        """Yield one Request per archive URL, carrying the date in meta."""
        for url in self.urls:
            # The yield and the Request(...) call must be one logical
            # line: a bare `yield` followed by `Request(...)` on the next
            # line yields None and silently discards the request.
            # NOTE(review): url[-10:-4] yields "40401." for a URL ending
            # in "20140401.html"; url[-13:-5] would give the full
            # "20140401" -- confirm which slice is intended.
            yield Request(url=url,
                          meta={'date': str(url)[-10:-4]},
                          callback=self.parse)

    def parse(self, response):
        """Extract title/link/time for each article block on the page.

        Returns a list of DmozItem populated from the archive listing,
        with the date forwarded from the request meta.
        """
        sel = Selector(response)
        sites = sel.xpath('//*[@id="content"]/div[2]/div/div/div[1]')
        passed_date = response.meta.get('date')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.xpath('.//div/a/text()').extract()
            item['link'] = site.xpath('.//a/@href').extract()
            item['time'] = site.xpath('.//div/text()').extract()
            item['date'] = passed_date
            items.append(item)
        return items

I would like to store the items in an XML file with the following structure:

<root>
    <article_date>passed_date
        <article_time>item['time']
            <article_name>item['title']</article_name>
            <article_link>item['link']</article_link>
        </article_time>
    </article_date>
</root>


What I have tried so far (but nothing is written to the selected file):
import os
import xml.etree.cElementTree as ET

class TutorialPipeline(object):
    """Export scraped items to an XML file via Scrapy's XmlItemExporter.

    Opens the output file when the spider starts and finalizes/closes it
    when the spider stops.  Must be enabled in ITEM_PIPELINES for
    process_item to be called at all.
    """

    # NOTE: def __init__ must be indented inside the class body; as
    # originally pasted it sat at module level, which is an
    # IndentationError.
    def __init__(self):
        # Hook exporter setup/teardown to the spider lifecycle signals.
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}      # spider -> open file handle
        self.exporters = {}  # spider -> XmlItemExporter

    def spider_opened(self, spider):
        # open() does NOT expand '~' -- passing the literal string
        # '~/Documents/test.xml' writes to a directory literally named
        # '~' (or fails), which is why nothing appeared in ~/Documents.
        # Expand the user's home directory explicitly.
        path = os.path.expanduser('~/Documents/test.xml')
        file = open(path, 'w+b')
        self.files[spider] = file
        self.exporters[spider] = XmlItemExporter(file)
        self.exporters[spider].start_exporting()

    def spider_closed(self, spider):
        # Remove the exporter as well as the file so neither leaks
        # across spider runs.
        exporter = self.exporters.pop(spider)
        exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporters[spider].export_item(item)
        return item


-- 
You received this message because you are subscribed to the Google Groups 
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to scrapy-users+unsubscribe@googlegroups.com.
To post to this group, send email to scrapy-users@googlegroups.com.
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.

Reply via email to