Hello Everyone,

I am new to Scrapy, so please bear with me.

I have been able to crawl several web pages and get the data I needed by 
feeding the URLs directly into the start_urls variable, even thousands of 
them at a time.
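
For context, that approach looked roughly like this (a simplified sketch with 
made-up names, not my real spider):

from scrapy.spider import Spider

class ExampleSpider(Spider):
    name = "example"
    allowed_domains = ["example.com"]
    # one URL per line in a plain text file, thousands of lines
    start_urls = [line.strip() for line in open('urls.txt') if line.strip()]

    def parse(self, response):
        # extract whatever is needed from each page here
        pass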

But now I have a crawler that I feed about 30,000 URLs (all on the same 
domain), and it stops after crawling only about 100 of them. I just do not 
know why! :(

I tried googling and searching around, but could not find a solution, so I 
was hoping someone could shed some light on the problem.

Here is the script:

#!/usr/bin/python -tt
# -*- coding: utf-8 -*-
import MySQLdb
import logging
import datetime
import os
import time
from scrapy.spider import Spider
from scrapy.selector import Selector

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

from dirbot.items import Website
now = datetime.datetime.now()


LOG_FILENAME = (os.environ['CRAWLER_LOG'] + 'PRINFOR_4_date_' +
                str(now.day) + '_' + str(now.month) + '_' + str(now.year) +
                '_hour_' + str(now.hour) + '_' + str(now.minute) + '.log')
logging.basicConfig(filename=LOG_FILENAME,level=logging.DEBUG)
    
db = MySQLdb.connect(host=os.environ['CRAWLER_HOST'],
                     user=os.environ['CRAWLER_USER'],
                     passwd=os.environ['CRAWLER_PASS'],
                     db=os.environ['CRAWLER_DB'])
cur = db.cursor()


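# Spider that reads product links for prinfor.pt from the MySQL database
# and scrapes name, price, reference and category from each product page.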
class DmozSpider(Spider):

    name = "dmoz_prinfor_4"

    allowed_domains = ["prinfor.pt/"]

    start_urls = [""]

    # Feed the product links from the database into start_urls
    select = ("select distinct(link) from main_links "
              "where loja_link_id=19 and type=%s")
    data_select = ["P"]
    cur.execute(select, data_select)
    x=0
    for row in cur.fetchall():
        if x < 1:
            start_urls[0]=row[0]
        else:
            start_urls.append(row[0])
        x=x+1
    
    
    if start_urls[0] != '':
        select = ("delete from produtos where loja_cod_id=19")
        cur.execute(select)
        cur.execute("commit")
    
    
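    # Parse a single product page: extract description, price, reference and
    # category, clean them up and insert one row into the produtos table.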
    def parse(self, response):

        sel = Selector(response)
        sites = sel.xpath('//body')
        items = []

        
        for site in sites:
            item = Website()

        
            item['description'] = site.xpath('//h2[@class="productName"]/text()').extract()
            item['price'] = site.xpath('//span[@id="our_price_display"]/text()').extract()
            item['ref'] = site.xpath('//p[@align="center"]/text()').extract()
            item['cat1'] = site.xpath('//span[@class="navigation_end"]/a/text()').extract()

            item['description'].append("")
            item['price'].append("")
            item['ref'].append("")
            item['cat1'].append("")

            items.append(item)
            

            
            
        item['description'][0] = item['description'][0].strip()
        item['price'][0] = item['price'][0].strip()
        item['price'][0] = item['price'][0].replace(',', '.')
        item['price'][0] = item['price'][0].replace(' ', '')
        item['price'][0] = item['price'][0][:-1]
        item['ref'][0] = item['ref'][0][12:]
        item['description'][0] = item['description'][0].encode('utf-8', 'replace')

 
            
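        # Compute the next index_id for this store (loja_cod_id=19)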
        select = ("select MAX(index_id) from produtos where loja_cod_id=19")
        cur.execute(select)
        for row in cur.fetchall():
            if (row[0] is None) or (not row[0]):
                index_id = 1
            else:
                index_id = int(row[0]) +1
                    
        prod_cod_id = "PRINFOR-" + str(index_id)
        string_url=str(response.url)
        
        insert = ("INSERT INTO produtos (prod_cod_id, loja_cod_id, index_id, act, ref_num, "
                  "name, prod, price_eur, price_ori, cur_ori, link, disp, cat_n1, cat_n2, "
                  "cat_n3, cat_n4, new) "
                  "VALUES (%s, 19, %s, %s, %s, %s, %s, %s, 0, %s, %s, %s, %s, %s, %s, %s, %s)")

        data = [prod_cod_id, index_id, "Y", item['ref'][0], item['description'][0], "",
                item['price'][0], "EUR", string_url, 'Verificar loja'.decode('utf-8'),
                item['cat1'][0], "", "", "", "Y"]
        print index_id

        try:
            cur.execute(insert, data)
            cur.execute("commit")
            logging.debug('Inserted product link: ' + string_url)
        except MySQLdb.OperationalError:
            logging.debug('Error inserting product link: ' + string_url)
            
        
        return items


        
    logging.debug('End of LOG.')
