Hi Shane,
Some good news here!!
Over the past few days, I have been going over the Spider, Request and
Response classes as you had suggested. I also tried to understand Hadoop
streaming again. I have decided to make some changes to my idea of the
implementation. Instead of files, I will be passing data using pipes by
forking a process, as was originally suggested on the Ideas page.
I have implemented some basic functionality, wherein we can create a spider
and set the domain and start_url properties. For now the "other" language
is Python, but it can easily be anything else, as I am only writing to
stdout and reading from stdin.
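Roughly, the core of the idea is just this (a stripped-down sketch, not the actual
spider; the file names and the example item here are only placeholders, the real
spider streaming.py is attached below):

import json
from subprocess import Popen, PIPE

# fork the Input script and read the initial request it prints as a JSON line
input_proc = Popen(['python', 'INPUT.py'], stdout=PIPE)
start_request = json.loads(input_proc.stdout.readline())
# ... the spider then crawls start_request['url'] ...

# fork the Output script and pipe the scraped items into its stdin as JSON
items = [{'title': 'Example', 'link': 'http://example.com', 'desc': '...'}]
output_proc = Popen(['python', 'OUTPUT.py'], stdin=PIPE)
output_proc.communicate(json.dumps(items))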
The structure of the program is this:
- On the terminal you will type this:
  >> scrapy crawl streaming -a Input=/home/faisal/Dropbox/PROGRAMS/SCRAPY/sandbox/INPUT.py -a Output=/home/faisal/Dropbox/PROGRAMS/SCRAPY/sandbox/OUPUT.py
- Here /home/faisal/Dropbox/PROGRAMS/SCRAPY/sandbox/INPUT.py is a Python file
  (it could be in any language) which sets the domain and start_url properties.
- Here /home/faisal/Dropbox/PROGRAMS/SCRAPY/sandbox/OUPUT.py is a Python file
  which reads, from stdin, the JSON created by the parse method of my spider.
- streaming.py is the spider.
What do you think about this?
Thanks,
Faisal
On Tuesday, February 18, 2014 4:34:19 PM UTC+4, shane wrote:
>
>> from json import loads, dumps
>>
>> # equivalent of scrapy start_requests() - the initial requests
>> start_request = dict(method='GET', url='http://...')
>> print dumps(start_request)
>>
>> for line in sys.stdin:
>>     response = loads(line)
>>     print dumps(parse_item(data))
>>     print dumps(parse_links_to_follow(data))
>>
> sorry, this should obviously be:
>
> print dumps(parse_item(response))
> print dumps(parse_links_to_follow(response))
>
# INPUT.py -- sets the domain and start_url by emitting the initial request as JSON
import sys
from json import dumps

start_request = dict(method='GET',
                     url="http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
                     domain="dmoz.org")
# one JSON object per line, so the spider can read it with readline()
sys.stdout.write(dumps(start_request) + '\n')
# OUTPUT.py -- reads the JSON items produced by the spider's parse method from stdin
import sys
from json import loads

print "THIS IS THE OUTPUT"
for line in sys.stdin:
    print loads(line)
# streaming.py -- the spider: forks the Input script to get the start request,
# then forks the Output script and pipes the scraped items to it as JSON
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.item import Item, Field
import sys
from subprocess import PIPE, Popen
from threading import Thread
from json import loads, dumps
from time import sleep
try:
    from Queue import Queue, Empty
except ImportError:
    from queue import Queue, Empty  # python 3.x

ON_POSIX = 'posix' in sys.builtin_module_names


def enqueue_output(out, queue):
    # read the child's stdout line by line and hand each line to the queue
    for line in iter(out.readline, b''):
        queue.put(line)
    out.close()


class StreamingItem(Item):
    title = Field()
    link = Field()
    desc = Field()


Output_filename = None


class StreamingSpider(Spider):
    name = "streaming"
    allowed_domains = []
    start_urls = ()

    def __init__(self, Input=None, Output=None, *args, **kwargs):
        super(StreamingSpider, self).__init__(*args, **kwargs)
        # fork the Input script and read its stdout on a background thread
        p = Popen(['python', str(Input)], stdout=PIPE, bufsize=1, close_fds=ON_POSIX)
        q = Queue()
        t = Thread(target=enqueue_output, args=(p.stdout, q))
        t.daemon = True  # thread dies with the program
        t.start()
        global Output_filename
        Output_filename = Output
        # read a line without blocking
        sleep(5)
        try:
            line = q.get_nowait()  # or q.get(timeout=.1)
        except Empty:
            print('no output yet')
        else:
            # got a line: it is the initial request emitted by the Input script
            res = loads(line)
            self.allowed_domains.append(res["domain"])
            self.start_urls = self.start_urls + (res["url"],)

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//ul/li')
        items = []
        for site in sites:
            item = {}
            item['title'] = str(site.xpath('a/text()').extract())
            item['link'] = str(site.xpath('a/@href').extract())
            item['desc'] = str(site.xpath('text()').extract())
            items.append(item)
        print len(items)
        json_string = dumps(items)
        # fork the Output script and pipe the scraped items to its stdin as JSON
        global Output_filename
        proc = Popen(['python', str(Output_filename)], stdin=PIPE)
        proc.communicate(json_string)