Hello,

Thanks for a very interesting project.

In case anyone's interested or can help me improve it, I thought I'd post 
this here. It's a primitive script for turning the GNU GCIDE dictionary 
files

http://gcide.gnu.org.ua/download

into a Mnemosyne card set. Unpack the CIDE.* files into a directory, run 
this Python 3 script in that directory, and then merge the resulting 
text files:

$ cat CIDE.?.txt > gcide.txt

Among other problems, I haven't figured out what all the diacritics and 
ligatures mean yet.

-- 
You received this message because you are subscribed to the Google Groups 
"mnemosyne-proj-users" group.
To post to this group, send email to [email protected].
To unsubscribe from this group, send email to 
[email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msg/mnemosyne-proj-users/-/rrCsSB0-XGIJ.
For more options, visit https://groups.google.com/groups/opt_out.


import os
import glob

def clean(buff):
    """Return *buff* with GCIDE markup normalised for use as card text.

    Three passes run in a fixed order:

    1. literal substitutions (tab -> ``<br>``, note prefix, quote and
       accent entities, and deletion of markup fragments that have no
       replacement);
    2. removal of the comma that directly follows a closing ``</hw>``,
       ``</pr>`` or ``</wf>`` tag;
    3. removal of whole ``<mhw>``, ``<hw>``, ``<pr>``, ``<wf>`` and
       ``<-- ... -->`` spans via ``remove``.

    The order inside each table matters: longer patterns are listed before
    their own prefixes (``<hand/>`` before ``<hand/``, ``(<?/)`` before
    ``<?/``), and the comma clean-up must see the closing tags before the
    spans that contain them are deleted.
    """
    # (old, new) pairs, applied top to bottom.
    substitutions = (
        ("\t", "<br>"),
        ("<note>", "<note>Note:"),
        ("<ldquo/", '\"'),
        ("<rdquo/", '\"'),
        # Accented letters are currently flattened to a placeholder.
        ("<eacute/", '?'),
        ("<acir/", '?'),
        ("<ae/", 'ae'),
        # Everything below is simply dropped.
        ("<hand/>", ""),
        ("<hand/", ""),
        ("<br/", ""),
        ("(<?/)", ""),
        ("<?/", ""),
        ("\\'d8", ""),
        ("<img", ""),
        ("</img>", ""),
        ("src='images\\", ""),
    )
    for old, new in substitutions:
        buff = buff.replace(old, new)

    # Drop the comma that trails these closing tags.
    for closing_tag in ("</hw>", "</pr>", "</wf>"):
        buff = buff.replace(closing_tag + ",", closing_tag)

    # Delete these spans entirely, delimiters included.
    spans = (
        ("<mhw>", "</mhw>"),
        ("<hw>", "</hw>"),
        ("<pr>", "</pr>"),
        ("<wf>", "</wf>"),
        ("<--", "-->"),
    )
    for opening, closing in spans:
        buff = remove(buff, opening, closing)

    return buff

def remove(buff, startstring, endstring):
    """Return *buff* with every ``startstring ... endstring`` span deleted.

    The first occurrence of *startstring* is paired with the first
    *endstring* that follows it, and the whole span (both delimiters
    included) is cut out.  The search then restarts from the beginning of
    the shortened buffer, so a start marker that only comes into being
    when the text on either side of a removed span is joined is handled
    too.

    Processing stops at the first start marker that has no matching end
    marker; that marker and everything after it are left untouched.

    Args:
        buff: text to process.
        startstring: marker that opens a span.
        endstring: marker that closes a span.

    Returns:
        The text with all complete spans removed.
    """
    # With two empty delimiters a zero-length "span" is found at index 0 on
    # every pass and the buffer never shrinks; there is nothing to remove,
    # so return unchanged instead of looping forever.
    if not startstring and not endstring:
        return buff

    while True:
        startindex = buff.find(startstring)
        if startindex == -1:
            break
        # Look for the end marker only after the start marker, without
        # copying the tail of the buffer into a temporary slice.
        endindex = buff.find(endstring, startindex + len(startstring))
        if endindex == -1:
            break
        buff = buff[:startindex] + buff[endindex + len(endstring):]

    return buff

def _render_definition(parts):
    """Join the collected definition lines and tidy them for output.

    Double quotes in the raw text become single quotes *before* ``clean``
    runs, so the only double quotes left in the output are the ones
    ``clean`` itself produces.  Every entry, including the last one of a
    file, goes through this same path.
    """
    return clean("".join(parts).replace('"', "'"))


# Convert every single-suffix dictionary file (CIDE.A, CIDE.B, ...) in the
# current directory into "<file>.txt", one "headword<TAB>definition" record
# per entry.
for file in glob.glob(os.path.join(os.getcwd(), 'CIDE.?')):

    entry = ""       # headword of the entry currently being collected
    parts = []       # definition lines of that entry, joined once per entry
    started = False  # becomes True at the first '<p><ent>' line

    # The input is read as latin1, as before.  The output encoding is now
    # explicit so the result does not depend on the platform locale and
    # cannot fail on characters the locale encoding lacks.
    # NOTE(review): UTF-8 is assumed to be what Mnemosyne's text import
    # expects -- confirm against its documentation.
    with open(file, "r", encoding='latin1') as source:
        with open(file + '.txt', "w", encoding='utf-8') as target:

            for line in source:
                if line.startswith('<p><ent>'):
                    started = True
                    # Strip the 8-character '<p><ent>' prefix and the last
                    # 11 characters of the line.
                    # NOTE(review): 11 presumably covers a trailing
                    # '</ent><br/' plus the newline -- confirm against the
                    # data files.
                    headword = line[8:-11]
                    if headword != entry:
                        # Finish the previous record, then start the next.
                        # The newline is written *before* each record, so
                        # a file starts with a blank line and ends without
                        # one; that layout is kept so that concatenating
                        # the outputs still puts each record on its own
                        # line.
                        target.write(_render_definition(parts))
                        target.write('\n')
                        entry = headword
                        target.write(entry + '\t')
                        parts = []
                elif started:
                    # Lines before the first entry are preamble and are
                    # skipped; afterwards every non-entry line belongs to
                    # the current definition.
                    parts.append(line.rstrip('\n') + ' ')

            # Flush the definition of the last entry in the file.
            target.write(_render_definition(parts))
    

Reply via email to