Hello,

As part of an introductory course on computational neuroscience, we
learned the basics of NLTK and used it to analyze WikiLeaks cables.

On my own, I tried PyPy 1.9 (under Ubuntu 12.04 64-bits) and a simple
MapReduce scheme as an attempt to improve performance. There are 14266
files under "cable", adding up to 1.2GB. The dataset can be downloaded
as a 30MB compressed 7z here:
http://www.dc.uba.ar/materias/incc/practicas/p2/nltk/wikis.7z

The results are:

$ time python test_mapreduce.py
170686
python test_mapreduce.py  1897.59s user 13.10s system 338% cpu 9:24.29 total

$ time ~/Downloads/pypy-1.9/bin/pypy test_mapreduce.py
170685
~/Downloads/pypy-1.9/bin/pypy test_mapreduce.py  573.78s user 15.64s
system 170% cpu 5:46.41 total

I find it strange that PyPy uses only about a quarter of the CPU time
that CPython does, yet takes about half the wall-clock time. Watching the
CPU usage of my 4 cores confirms it: approximately half of the available
cycles aren't used (sometimes it seems only 2 cores are used). As I'm not
running another process that consumes them, I suspect PyPy is blocking
for some reason (i.e. removed from the scheduling queue by waiting, or
some other system call). It didn't improve by using 8 processes instead
of 4.

Do you think there is a problem with my code (actually, I'm new to Python)?

Thanks in advance,
Alejandro

P.S.: please CC me because I'm not subscribed.
import fnmatch
import multiprocessing as mp
import os
import zipfile
from functools import partial, reduce

import nltk
from BeautifulSoup import BeautifulStoneSoup
#import numpypy as numpy

def find_files(startdir, pattern):
	"""
	Recursively search below 'startdir' for files whose names match
	'pattern' (fnmatch-style glob) and return them as a list of paths.

	Example:
		find_files('wikis/cable', '*BUENOSAIRES*.html')
	"""

	return [os.path.join(dirpath, name)
			for dirpath, _dirnames, names in os.walk(startdir)
			for name in fnmatch.filter(names, pattern)]

def extract_text(string):
	"""
	Read a WikiLeaks "cable" HTML file, strip the tags and odd
	characters, and return the text ready for tokenizing.

	Example:
		with open('66BUENOSAIRES2481.html') as f:
			texto = extract_text(f.read())
			tokens = nltk.word_tokenize(texto)
			print ' '.join(tokens)
	"""

	start = string.find("<table class='cable'>")
	if start == -1:
		raise Exception('el archivo no es un wikileak')

	# Drop the page header before the cable table, then strip the markup.
	stripped = nltk.clean_html(string[start:])
	# Normalize character entities via BeautifulSoup, then lowercase and
	# turn the escaped line feeds back into real newlines.
	soup = BeautifulStoneSoup(
		stripped,
		convertEntities=BeautifulStoneSoup.XML_SPECIAL_CHARS_TO_ENTITIES)
	text = soup.contents[0]
	return text.lower().replace('&#x000a;', '\n')

def mapreduce_single(cosas, func_map, func_reduce):
	"""
	Sequential map-reduce: apply func_map to every element of 'cosas'
	and fold the results together with the binary function func_reduce.
	"""
	return reduce(func_reduce, map(func_map, cosas))

class ReduceInside():
	"""
	Picklable callable wrapping reduce(f, ls); plain functions built with
	closures can't cross process boundaries, but instances of this class can.
	"""
	def __init__(self, f):
		# Binary function used as the folding step.
		self.func = f

	def __call__(self, ls):
		folded = reduce(self.func, ls)
		return folded


def mapreduce_multi(cosas, func_map, func_reduce):
	"""
	Parallel map-reduce over the sequence 'cosas' using a process pool.

	func_map is applied to every element in parallel, the mapped results
	are split into one chunk per CPU, each chunk is folded in parallel
	with func_reduce, and the partial results are folded once more in the
	parent process. Both callables must be picklable (module-level).
	"""
	def splitter(l, n):
		# Ceiling division so the chunk size is never 0: the original
		# len(l) / n was 0 whenever len(l) < n, which made
		# xrange(0, len(l), 0) raise ValueError.
		cs = max(1, -(-len(l) // n))
		for i in range(0, len(l), cs):
			yield l[i:i + cs]

	pool = mp.Pool()
	try:
		rs = pool.map(func_map, cosas)
		sep = list(splitter(rs, mp.cpu_count()))
		# partial(reduce, func_reduce) is picklable and equivalent to the
		# ReduceInside wrapper used elsewhere in this file.
		tmp = pool.map(partial(reduce, func_reduce), sep)
	finally:
		# The original leaked the pool: worker processes were never
		# closed or joined.
		pool.close()
		pool.join()
	return reduce(func_reduce, tmp)

def func_map(x):
	"""Read the cable HTML file at path 'x' and return its set of word tokens."""
	with open(x, 'r') as f:
		contents = f.read()
	return set(nltk.wordpunct_tokenize(extract_text(contents)))

def func_reduce(old, new):
	"""Merge two word sets; used as the folding step of the map-reduce."""
	return set.union(old, new)

def test1():
	"""Count the distinct words across all cable HTML files on disk."""
	html_files = find_files('wikis/cable', '*.html')
	vocabulary = mapreduce_multi(html_files, func_map, func_reduce)
	print(len(vocabulary))

class MapWithZip():
	"""
	Picklable callable that tokenizes one HTML member of a ZIP archive,
	so the archive doesn't have to be unpacked to disk first.
	"""
	def __init__(self, zf):
		# NOTE(review): self.zf holds an open ZipFile; pickling it to
		# worker processes may not be safe — confirm with multiprocessing.
		self.zf = zf

	def __call__(self, x):
		with self.zf.open(x, 'r') as f:
			contents = f.read()
		return set(nltk.wordpunct_tokenize(extract_text(contents)))

def test2():
	"""Count the distinct words across all cables read directly from wikis.zip."""
	archive = zipfile.ZipFile("wikis.zip", "r")
	members = fnmatch.filter(archive.namelist(), "wikis/cable/*.html")
	vocabulary = mapreduce_multi(members, MapWithZip(archive), func_reduce)
	print(len(vocabulary))

if __name__ == '__main__':
	# Entry point: run the on-disk variant (test2 is the zip-archive variant).
	test1()
_______________________________________________
pypy-dev mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-dev

Reply via email to