Uhmmmm... bueno aquí coloco el código con las pequeñas modificaciones que he
hecho para probar:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re, collections

def words(text):
    text.lower()
    return re.findall('\w+', 'í'.decode('utf-8'), re.UNICODE) #Para probar
si reconoce las palabras con 'í'

def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

NWORDS = train(words(file('Diccionario.txt').read()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):
   splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
   deletes    = [a + b[1:] for a, b in splits if b]
   transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
   replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
   inserts    = [a + c + b     for a, b in splits for c in alphabet]
   return set(deletes + transposes + replaces + inserts)

def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in
NWORDS)

def known(words): return set(w for w in words if w in NWORDS)

def correct(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word)
or [word]
    key=NWORDS.get
    print candidates
    print key

if __name__=='__main__':
    word= raw_input('Palabra en español: ')
    correct(word)

Y esta es la salida:

Palabra en español: día
['d\xc3\xada']
<built-in method get of collections.defaultdict object at 0xb773f89c>

Y según lo que me explica Rolando no debería ser así verdad? :(...
Diccionario.txt no está en utf-8.

No tengo mucha experiencia con esto :(... qué puedo hacer?

Gracias! :)
_______________________________________________
Python-es mailing list
Python-es@python.org
http://mail.python.org/mailman/listinfo/python-es
FAQ: http://python-es-faq.wikidot.com/

Responder a