On 11/03/2012 10:47 AM, Maciej Fijalkowski wrote:
>
> I guess one thing I can say is that without looking at your algorithm
> it's impossible to say.
The complete program is part of my first mail (there are two functions:
"test1" and "test2", for individual files and zip, respectively). I've
attached it again.
> PyPy will spend more time pickling and unpickling (since it's slower)
> but might be way faster at the actual processing. This might lead to
> different time reports (as the message transport time will be higher).
I see. But this still doesn't explain the difference between using many
files and one big zip. Anyways the most important issue for me is the
one mentioned in the original mail: the fact that the PyPy processes
sit idle half of the time while CPython keeps running (i.e., PyPy could
finish in less "wall clock" time than it currently does).
>
> For what is worth, maybe you should stop using multiprocessing (it's a
> giant hack) and use explicit socket-based communication? I suggest
> using something like twisted or execnet. You'll end up with a cleaner
> model and likely with a faster solution.
>
> Since the data is mostly read-only, you can also just run completely
> separate processes that mmap the same data.
>
> Cheers,
> fijal
I agree with you about multiprocessing and using explicit communication.
The main reason to do this was to show some of my colleagues the
difference between parallelism and concurrency, and that the former can
be used without the latter (even if it's implemented using it), that's
why I chose multiprocessing.

I'll try to write (or Google) a MapReduce scheme as you suggested; in
the zip case, I'll have to find a way to share cached pieces of the file
(or mmap it to memory and share across processes, as you suggested).

Thanks,
Alejandro
import nltk
from BeautifulSoup import BeautifulStoneSoup
import os
import fnmatch
#import numpypy as numpy
import multiprocessing as mp
import zipfile

def find_files(startdir, pattern):
	"""
	Busca todos los archivos recursivamente a partir de 'startdir',
	que concuerden con 'pattern' y los devuelve en una lista.

	Por ejemplo:
		find_files('wikis/cable', '*BUENOSAIRES*.html')
	"""

	matches = []
	for root, dirnames, filenames in os.walk(startdir):
		for filename in fnmatch.filter(filenames, pattern):
			matches.append(os.path.join(root, filename))
	return matches

def extract_text(string):
	"""Read a WikiLeaks "cable" HTML file, strip the tags and odd
	characters, and return text that is ready to be tokenized.

	Example:
		with open('66BUENOSAIRES2481.html') as f:
			texto = extract_text(f.read())
			tokens = nltk.word_tokenize(texto)
			print ' '.join(tokens)
	"""

	# The cable body starts at this table; everything before it is
	# page chrome we do not want.
	start = string.find("<table class='cable'>")
	if start == -1:
		raise Exception('el archivo no es un wikileak')

	body = string[start:]
	stripped = nltk.clean_html(body)
	soup = BeautifulStoneSoup(
		stripped,
		convertEntities=BeautifulStoneSoup.XML_SPECIAL_CHARS_TO_ENTITIES)
	text = soup.contents[0]
	# Normalize case and turn the leftover encoded newlines back into '\n'.
	return text.lower().replace('&#x000a;', '\n')

def mapreduce_single(cosas, func_map, func_reduce):
	"""Sequential map/reduce: apply *func_map* to every item of *cosas*
	and fold the results with the binary function *func_reduce*.

	Raises TypeError (from reduce) if *cosas* is empty.
	"""
	# On Python 3 'reduce' is no longer a builtin; functools.reduce is
	# the same object on Python 2.6+, so this works on both.
	from functools import reduce
	rs = map(func_map, cosas)
	return reduce(func_reduce, rs)

class ReduceInside(object):
	"""Picklable callable that folds a list with a fixed binary function.

	multiprocessing can only ship module-level callables/instances to
	worker processes, so the reduction step is wrapped in a class
	instead of a closure or lambda.
	"""
	def __init__(self, f):
		# f: binary function, e.g. set.union-style merge.
		self.func = f
	def __call__(self, ls):
		# functools.reduce works on Python 2.6+ and Python 3, where
		# 'reduce' is not a builtin anymore.
		from functools import reduce
		return reduce(self.func, ls)


def mapreduce_multi(cosas, func_map, func_reduce):
	"""Parallel map/reduce over a multiprocessing pool.

	*func_map* is applied to every item of *cosas* in the pool, the
	results are split into one chunk per CPU, each chunk is reduced in
	the pool, and the partial results are folded locally with
	*func_reduce*.  Both callables must be picklable (module-level
	functions or instances such as ReduceInside).
	"""
	# Python 3 compatibility: 'reduce' is not a builtin there.
	from functools import reduce

	def splitter(l, n):
		# Integer chunk size; clamp to >= 1 so that fewer items than
		# CPUs does not produce a zero step (range would raise
		# ValueError).  '//' keeps this an int on Python 3 too.
		cs = max(1, len(l) // n)
		for i in range(0, len(l), cs):
			yield l[i:i + cs]

	pool = mp.Pool()
	try:
		rs = pool.map(func_map, cosas)
		sep = list(splitter(rs, mp.cpu_count()))
		tmp = pool.map(ReduceInside(func_reduce), sep)
	finally:
		# Always release the worker processes, even if a map step fails.
		pool.close()
		pool.join()
	return reduce(func_reduce, tmp)

def func_map(x):
	"""Map step: read the HTML file at path *x* and return the set of
	its word tokens."""
	with open(x, 'r') as src:
		raw = extract_text(src.read())
	return set(nltk.wordpunct_tokenize(raw))

def func_reduce(old, new):
	"""Reduce step: return the union of two token sets.

	Neither argument is mutated; a new set is returned.
	"""
	merged = old.union(new)
	return merged

def test1():
	"""Count the distinct words across all cable HTML files, mapping
	one file per pool task."""
	files = find_files('wikis/cable', '*.html')
	words = mapreduce_multi(files, func_map, func_reduce)
	# Parenthesized print is valid on both Python 2 (expression) and 3.
	print(len(words))

class MapWithZip(object):
	"""Map step that reads cable HTML members out of a zip archive.

	Only the archive *path* is stored, not the open ZipFile: an open
	ZipFile cannot be pickled for the pool workers, and letting forked
	children share the parent's file handle would make them seek on the
	same descriptor and corrupt each other's reads.  Each worker
	process lazily opens its own handle on first use instead.
	"""
	def __init__(self, zf):
		# zf: an open zipfile.ZipFile; we keep its path only.
		self.zip_path = zf.filename
		self._zf = None  # opened per worker process on first call
	def __call__(self, x):
		if self._zf is None:
			self._zf = zipfile.ZipFile(self.zip_path, 'r')
		f = self._zf.open(x, 'r')
		try:
			raw = extract_text(f.read())
		finally:
			# ZipExtFile is not a context manager on older Python 2,
			# so close it explicitly.
			f.close()
		return set(nltk.wordpunct_tokenize(raw))

def test2():
	"""Count the distinct words across the cable HTML files stored
	inside wikis.zip, mapping one member per pool task."""
	zf = zipfile.ZipFile("wikis.zip", "r")
	files = fnmatch.filter(zf.namelist(), "wikis/cable/*.html")
	words = mapreduce_multi(files, MapWithZip(zf), func_reduce)
	# Parenthesized print is valid on both Python 2 (expression) and 3.
	print(len(words))

# Entry point: run the per-file benchmark (call test2() instead for the
# zip-archive variant discussed in the mail thread).
if __name__ == '__main__':
	test1()
_______________________________________________
pypy-dev mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-dev

Reply via email to