Hello,
As part of an introductory course on computational neuroscience, we
learned the basics of NLTK and used it to analyze WikiLeaks cables.
On my own, I tried PyPy 1.9 (under Ubuntu 12.04 64-bits) and a simple
MapReduce scheme as an attempt to improve performance. There are 14266
files under "cable", adding up to 1.2GB. It can be downloaded as a 30MB
compressed 7z here:
http://www.dc.uba.ar/materias/incc/practicas/p2/nltk/wikis.7z
The results are:
$ time python test_mapreduce.py
170686
python test_mapreduce.py 1897.59s user 13.10s system 338% cpu 9:24.29 total
$ time ~/Downloads/pypy-1.9/bin/pypy test_mapreduce.py
170685
~/Downloads/pypy-1.9/bin/pypy test_mapreduce.py 573.78s user 15.64s
system 170% cpu 5:46.41 total
I find it strange that PyPy uses roughly 3.3x less CPU time than
CPython (574s vs. 1898s user), while the total wall-clock time
of my 4 cores confirms it: approximately half of the available cycles
aren't used (sometimes it seems only 2 cores are used). As I'm not
running any other process that consumes them, I suspect PyPy is blocking
for some reason (i.e., being taken off the run queue while waiting on I/O
or some other blocking system call). It didn't improve by using 8 processes instead
of 4.
Do you think there is a problem with my code (actually, I'm new to Python)?
Thanks in advance,
Alejandro
P.S.: please CC me because I'm not subscribed.
import nltk
from BeautifulSoup import BeautifulStoneSoup
import os
import fnmatch
#import numpypy as numpy
import multiprocessing as mp
import zipfile
def find_files(startdir, pattern):
    """
    Recursively search under 'startdir' for every file whose name
    matches 'pattern' and return the paths as a list.

    Example:
        find_files('wikis/cable', '*BUENOSAIRES*.html')
    """
    found = []
    for dirpath, _dirnames, names in os.walk(startdir):
        # fnmatch.filter keeps only the names matching the glob pattern.
        found.extend(os.path.join(dirpath, name)
                     for name in fnmatch.filter(names, pattern))
    return found
def extract_text(string):
    """
    Read a WikiLeaks HTML file (a "cable"), strip the tags and odd
    characters, leaving the text ready to tokenize.

    Example:
        with open('66BUENOSAIRES2481.html') as f:
            texto = extract_text(f.read())
        tokens = nltk.word_tokenize(texto)
        print ' '.join(tokens)

    Raises ValueError (a subclass of Exception, so existing callers
    still catch it) when the file does not look like a cable.
    """
    ind = string.find("<table class='cable'>")
    if ind == -1:
        raise ValueError('el archivo no es un wikileak')
    # Drop everything before the cable table, then strip the HTML tags.
    sin_header = string[ind:]
    sin_html = nltk.clean_html(sin_header)
    # Re-encode the special characters as XML entities and take the
    # resulting text node.
    sin_codigos_feos = BeautifulStoneSoup(
        sin_html,
        convertEntities=BeautifulStoneSoup.XML_SPECIAL_CHARS_TO_ENTITIES
    ).contents[0]
    # NOTE(review): the posted code had a raw line break inside the
    # .replace() string literal (a SyntaxError). It almost certainly
    # meant to normalize carriage returns to newlines -- confirm against
    # the original source.
    return sin_codigos_feos.lower().replace('\r', '\n')
def mapreduce_single(cosas, func_map, func_reduce):
    """Apply func_map to every item, then fold the results with func_reduce."""
    mapped = [func_map(cosa) for cosa in cosas]
    return reduce(func_reduce, mapped)
class ReduceInside():
    """
    Picklable callable that folds a whole list with a stored function.

    A Pool worker can only receive picklable callables, so the reduce
    step is wrapped in an instance instead of a local lambda.
    """

    def __init__(self, f):
        # Binary function used to fold the list.
        self.func = f

    def __call__(self, ls):
        folded = reduce(self.func, ls)
        return folded
def mapreduce_multi(cosas, func_map, func_reduce):
    """
    Parallel map/reduce over 'cosas' using a multiprocessing Pool.

    func_map is applied to every item in parallel; the partial results
    are then reduced in parallel chunks (one per CPU) and finally folded
    into a single value with func_reduce.
    """
    def splitter(l, n):
        # Yield successive chunks of l of roughly len(l)/n items each.
        # max(1, ...) guards against a zero chunk size when there are
        # fewer results than CPUs (xrange with step 0 raises ValueError).
        cs = max(1, len(l) // n)
        for i in xrange(0, len(l), cs):
            yield l[i:i + cs]
    pool = mp.Pool()
    try:
        rs = pool.map(func_map, cosas)
        sep = list(splitter(rs, mp.cpu_count()))
        tmp = pool.map(ReduceInside(func_reduce), sep)
    finally:
        # The original never closed the pool, leaking worker processes.
        pool.close()
        pool.join()
    return reduce(func_reduce, tmp)
def func_map(x):
    """Map step: read one cable file at path x and return its set of tokens."""
    with open(x, 'r') as fh:
        texto = extract_text(fh.read())
    return set(nltk.wordpunct_tokenize(texto))
def func_reduce(old, new):
    """Reduce step: merge two token sets into a single vocabulary set."""
    return set.union(old, new)
def test1():
files = find_files('wikis/cable', '*.html')
words = mapreduce_multi(files, func_map, func_reduce)
print len(words)
class MapWithZip():
    """
    Map step that reads cables out of an open ZipFile instead of loose
    files on disk.

    NOTE(review): ZipFile handles are generally not picklable, so this
    may fail when shipped to multiprocessing Pool workers -- confirm.
    """

    def __init__(self, zf):
        # Archive handle shared by every call.
        self.zf = zf

    def __call__(self, x):
        with self.zf.open(x, 'r') as member:
            texto = extract_text(member.read())
        return set(nltk.wordpunct_tokenize(texto))
def test2():
zf = zipfile.ZipFile("wikis.zip", "r")
files = fnmatch.filter(zf.namelist(), "wikis/cable/*.html")
words = mapreduce_multi(files, MapWithZip(zf), func_reduce)
print len(words)
# Entry point: runs the on-disk variant (switch to test2() for the zip one).
if __name__ == '__main__':
    test1()
_______________________________________________
pypy-dev mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-dev