Hi Shane,
Some good news here!!
Over the past few days, I have been going over the Spider, Request and
Response classes as you had suggested. I also tried to understand Hadoop
streaming again. I have decided to make some changes to my idea of the
implementation. Instead of files, I will be passing data using pipes by
forking a process, as was originally suggested on the Ideas page.
I have implemented some basic functionality, wherein we can create a spider
and set the domain and start_url properties. For now the "other" language
is Python, but it can easily be anything else, as I am only writing to
stdout and reading from stdin.
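Roughly, the core of the idea is just this (a stripped-down sketch, not the actual
spider; the file names and the example item here are only placeholders, the real
spider streaming.py is attached below):

import json
from subprocess import Popen, PIPE

# fork the Input script and read the initial request it prints as a JSON line
input_proc = Popen(['python', 'INPUT.py'], stdout=PIPE)
start_request = json.loads(input_proc.stdout.readline())
# ... the spider then crawls start_request['url'] ...

# fork the Output script and pipe the scraped items into its stdin as JSON
items = [{'title': 'Example', 'link': 'http://example.com', 'desc': '...'}]
output_proc = Popen(['python', 'OUTPUT.py'], stdin=PIPE)
output_proc.communicate(json.dumps(items))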
The structure of the program is this:
- On the terminal you will type this:
  >> scrapy crawl streaming -a Input=/home/faisal/Dropbox/PROGRAMS/SCRAPY/sandbox/INPUT.py -a Output=/home/faisal/Dropbox/PROGRAMS/SCRAPY/sandbox/OUPUT.py
- Here /home/faisal/Dropbox/PROGRAMS/SCRAPY/sandbox/INPUT.py is a Python file
  (it could be in any language) which sets the domain and start_url properties.
- Here /home/faisal/Dropbox/PROGRAMS/SCRAPY/sandbox/OUPUT.py is a Python file
  which reads, from stdin, the JSON created by the parse method of my spider.
- streaming.py is the spider.
What do you think about this?
Thanks,
Faisal
On Tuesday, February 18, 2014 4:34:19 PM UTC+4, shane wrote:
>
>> from json import loads, dumps
>>
>> # equivalent of scrapy start_requests() - the initial requests
>> start_request = dict(method='GET', url='http://...')
>> print dumps(start_request)
>>
>> for line in sys.stdin:
>>     response = loads(line)
>>     print dumps(parse_item(data))
>>     print dumps(parse_links_to_follow(data))
>>
> sorry, this should obviously be:
>
> print dumps(parse_item(response))
> print dumps(parse_links_to_follow(response))
>
# INPUT.py -- sets the domain and start_url by emitting the initial request as JSON
import sys
from json import dumps

start_request = dict(method='GET',
                     url="http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
                     domain="dmoz.org")
# one JSON object per line, so the spider can read it with readline()
sys.stdout.write(dumps(start_request) + '\n')
# OUTPUT.py -- reads the JSON items produced by the spider's parse method from stdin
import sys
from json import loads

print "THIS IS THE OUTPUT"
for line in sys.stdin:
    print loads(line)
# streaming.py -- the spider: forks the Input script to get the start request,
# then forks the Output script and pipes the scraped items to it as JSON
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.item import Item, Field
import sys
from subprocess import PIPE, Popen
from threading import Thread
from json import loads, dumps
from time import sleep
try:
    from Queue import Queue, Empty
except ImportError:
    from queue import Queue, Empty  # python 3.x

ON_POSIX = 'posix' in sys.builtin_module_names


def enqueue_output(out, queue):
    # read the child's stdout line by line and hand each line to the queue
    for line in iter(out.readline, b''):
        queue.put(line)
    out.close()


class StreamingItem(Item):
    title = Field()
    link = Field()
    desc = Field()


Output_filename = None


class StreamingSpider(Spider):
    name = "streaming"
    allowed_domains = []
    start_urls = ()

    def __init__(self, Input=None, Output=None, *args, **kwargs):
        super(StreamingSpider, self).__init__(*args, **kwargs)
        # fork the Input script and read its stdout on a background thread
        p = Popen(['python', str(Input)], stdout=PIPE, bufsize=1, close_fds=ON_POSIX)
        q = Queue()
        t = Thread(target=enqueue_output, args=(p.stdout, q))
        t.daemon = True  # thread dies with the program
        t.start()
        global Output_filename
        Output_filename = Output
        # read a line without blocking
        sleep(5)
        try:
            line = q.get_nowait()  # or q.get(timeout=.1)
        except Empty:
            print('no output yet')
        else:
            # got a line: it is the initial request emitted by the Input script
            res = loads(line)
            self.allowed_domains.append(res["domain"])
            self.start_urls = self.start_urls + (res["url"],)

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//ul/li')
        items = []
        for site in sites:
            item = {}
            item['title'] = str(site.xpath('a/text()').extract())
            item['link'] = str(site.xpath('a/@href').extract())
            item['desc'] = str(site.xpath('text()').extract())
            items.append(item)
        print len(items)
        json_string = dumps(items)
        # fork the Output script and pipe the scraped items to its stdin as JSON
        global Output_filename
        proc = Popen(['python', str(Output_filename)], stdin=PIPE)
        proc.communicate(json_string)