Hello,
Thanks for a very interesting project.
In case anyone's interested or can help me improve it, I thought I'd post
this here. It's a primitive script for turning the GNU GCIDE dictionary
files
http://gcide.gnu.org.ua/download
into a Mnemosyne card set. Unpack the CIDE.* files into a directory, run
this Python 3 script in that directory, and then merge the resulting
text files (only the generated .txt files, not the original CIDE.? sources):
$ cat CIDE.?.txt > gcide.txt
Among other problems, I haven't figured out what all the diacritics and
ligatures mean yet.
--
You received this message because you are subscribed to the Google Groups
"mnemosyne-proj-users" group.
To post to this group, send email to [email protected].
To unsubscribe from this group, send email to
[email protected].
To view this discussion on the web visit
https://groups.google.com/d/msg/mnemosyne-proj-users/-/rrCsSB0-XGIJ.
For more options, visit https://groups.google.com/groups/opt_out.
import os
import glob
def clean(buff):
    """Return *buff* with dictionary markup rewritten or stripped.

    Two passes are made over the text:

    1. A fixed list of literal substitutions is applied in order (the order
       matters: e.g. ``<hand/>`` must be handled before ``<hand/``, and
       ``(<?/)`` before ``<?/``).
    2. Everything between certain start/end markers (inclusive) is deleted
       by delegating to ``remove``.
    """
    # (old, new) pairs, applied strictly top to bottom.
    substitutions = (
        ("\t", "<br>"),
        ("<note>", "<note>Note:"),
        ("<ldquo/", '\"'),
        ("<rdquo/", '\"'),
        # Accented characters are currently replaced by a '?' placeholder.
        ("<eacute/", '?'),
        ("<acir/", '?'),
        ("<ae/", 'ae'),
        ("<hand/>", ""),
        ("<hand/", ""),
        ("<br/", ""),
        ("(<?/)", ""),
        ("<?/", ""),
        ("\\'d8", ""),
        ("<img", ""),
        ("</img>", ""),
        ("src='images\\", ""),
        # Drop the comma directly following these closing tags so that the
        # span removal below does not leave a stray comma behind.
        ("</hw>,", "</hw>"),
        ("</pr>,", "</pr>"),
        ("</wf>,", "</wf>"),
    )
    for old, new in substitutions:
        buff = buff.replace(old, new)

    # (start, end) marker pairs whose whole span, markers included, is cut.
    delimited_spans = (
        ("<mhw>", "</mhw>"),
        ("<hw>", "</hw>"),
        ("<pr>", "</pr>"),
        ("<wf>", "</wf>"),
        ("<--", "-->"),
    )
    for opener, closer in delimited_spans:
        buff = remove(buff, opener, closer)

    return buff
def remove(buff, startstring, endstring):
    """Delete every ``startstring ... endstring`` span from *buff*.

    The first occurrence of *startstring* is paired with the first
    *endstring* that follows it; both markers and everything between them
    are cut out.  The search then restarts from the beginning of the
    shortened text, and stops as soon as either marker can no longer be
    found.  An opening marker without a matching closing marker is left
    untouched.

    Returns the resulting string.
    """
    opener_len = len(startstring)
    closer_len = len(endstring)

    cut_from = buff.find(startstring)
    while cut_from != -1:
        # Look for the closing marker only after the opening one.
        closer_at = buff.find(endstring, cut_from + opener_len)
        if closer_at == -1:
            break
        buff = buff[:cut_from] + buff[closer_at + closer_len:]
        # Rescan from the start: the cut may have formed a new opener.
        cut_from = buff.find(startstring)
    return buff
def _write_card(target, entry, parts):
    """Write one card to *target*: headword, TAB, cleaned definition, newline.

    Double quotes in the raw text are turned into single quotes *before*
    ``clean`` runs, so that quotes produced by ``clean`` itself are kept.
    This is applied to every entry, including the last one of a file.
    """
    body = ''.join(parts).replace('"', "'")
    target.write(entry + '\t' + clean(body) + '\n')


def _convert_file(path):
    """Convert one dictionary source file into ``path + '.txt'``.

    Lines starting with ``<p><ent>`` open a new entry; every other line is
    appended (newline replaced by a space) to the current entry's text.
    Anything before the first entry line is ignored.  Consecutive entry
    lines carrying the same headword are merged into a single card.
    """
    # Context managers guarantee both files are closed even if a read,
    # clean-up or write step raises.
    with open(path, "r", encoding='latin1') as source:
        # The input may contain non-ASCII characters, so the output encoding
        # is fixed instead of depending on the platform default.
        # NOTE(review): UTF-8 chosen for the import side -- confirm against
        # Mnemosyne's text importer.
        with open(path + '.txt', "w", encoding='utf-8') as target:
            entry = None   # headword of the entry being collected
            parts = []     # text fragments belonging to that entry
            for line in source:
                if line[0:8] == '<p><ent>':
                    # Drops the 8-character prefix and the last 11
                    # characters -- presumably '</ent><br/' plus the
                    # newline; verify against the dictionary files.
                    headword = line[8:-11]
                    if headword != entry:
                        if entry is not None:
                            _write_card(target, entry, parts)
                        entry = headword
                        parts = []
                elif entry is not None:
                    parts.append(line.rstrip('\n') + ' ')
            # Flush the final entry of the file.
            if entry is not None:
                _write_card(target, entry, parts)


# Convert every single-letter-suffix file (CIDE.A, CIDE.B, ...) in the
# current directory; the generated CIDE.?.txt files do not match the
# pattern, so re-running the script never re-processes its own output.
for file in glob.glob(os.path.join(os.getcwd(), 'CIDE.?')):
    _convert_file(file)