Hi, If anyone is interested, here is a pre/post-processing script. To train: preprocess both source and target using tag-wrapper.py tag. Call train-model.perl with -phrase-word-alignment. To translate: preprocess using tag-wrapper.py tag <filename1>, then translate using the moses option -T <filename2>, then post-process using tag-wrapper.py detag <filename1> <filename2>. The second file, tag-wrapper, is a shell script which does all this using named FIFO files, so you can just use: cat input | tag-wrapper "moses -f inifile"
The preprocessing script replaces all numbers and all html-type tags (i.e. using brackets: <...>) with placeholder tags; you can add other regexps if you wish, and of course you should remove the html-type regexp if you use xml input for moses (modify tag-wrapper.py, towards the end). Tag names are prefixed with ⊚, so you should make sure that this character does not appear in your corpus. As it is now, it will fail if a tag is either untranslated or translated by anything other than a tag of the same name (I'm starting to think that untranslated tags should be allowed, so this might change (something like "1 or 2 ||| some" seems like an acceptable phrase)). It has not been extensively tested yet; feedback / bug reports are welcome. Cheers, -- Raphael Payen 2010/7/19 Hieu Hoang <[email protected]>: > hey raphael > > i've added the alignment info to the translation details info, which you can > get by adding a switch when running the decoder > -T [file] > the output is like: > SOURCE: [0..0] das > TRANSLATED AS: the > WORD ALIGNED: 0-0 > > it's only working in text file at the moment. I can add it to my new on-disk > file format in the next few days, or christian can tell how to hack to info > into the old binary file format. However, we shouldn't waste effort by doing > both. Opinions gratefully received. > > On 19/07/2010 20:11, Raphael Payen wrote: >> >> Hi >> >> I am testing this. I made a small change in train-model.perl to remove >> the value of the option since it is a boolean flag, the training works >> and writes word alignment. >> >> Now I would like to be able to obtain word alignment after >> translation. I tried calling moses with many options: >> -print-alignment-info -use-alignment-info >> -print-alignment-info-in-n-best -n-best-list nbestfile 3 >> But this doesn't work. Is there something I'm missing ? 
>> >> I just read that in this same email below: >> >>> >>> I don't have much to say about word alignments in the decoder - since >>> I've found out that it's quite easy to obtain word alignments by putting >>> the >>> alignment info in a second factor in the phrase table, I don't need >>> special >>> code in the decoder to deal with this. >>> >> >> But I don't understand how to do that. >> >> Best regards, >> >> >
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re, string,cPickle
# Marker character that prefixes every placeholder token written by this
# script (a "NUM" match becomes this symbol followed by "NUM").  It must not
# occur anywhere in the corpus, otherwise ordinary text is mistaken for a tag.
tagsymbol= u"⊚"
def extract_tags(line, regexp, tagname, dic):
    """Replace every match of ``regexp`` in ``line`` by a placeholder token.

    Each match is replaced by ``tagsymbol + tagname`` and the text captured by
    the first group of ``regexp`` is stored in ``dic`` under the key
    ``(tagname, n)``, where ``n`` is the 0-based rank of the match in the line.

    line    -- the text to process.
    regexp  -- pattern with at least one capturing group.
    tagname -- name of the placeholder (e.g. "NUM").
    dic     -- dictionary updated in place with the captured texts.

    Returns the pieces of the line and the placeholders joined with single
    spaces, so every placeholder is surrounded by whitespace.

    Raises Exception if ``dic`` already holds an entry for one of the keys.
    """
    pieces = []
    pos = 0
    for i, match in enumerate(re.finditer(regexp, line)):
        key = (tagname, i)
        if key in dic:
            raise Exception("double tag: "+tagname+str(i))
        # Text between the previous match and this one, then the placeholder.
        pieces.append(line[pos:match.start()])
        pieces.append(tagsymbol+tagname)
        pos = match.end()
        dic[key] = match.group(1)
    # Whatever follows the last match (the whole line when nothing matched).
    pieces.append(line[pos:])
    # str.join instead of the deprecated string.join (removed in Python 3).
    return " ".join(pieces)
def restore_tags(tokensline, tagdict, places):
    """Put the original texts back in place of the placeholder tokens.

    tokensline -- list of tokens of the translated line, modified in place.
    tagdict    -- maps (tag name, rank) to the original text.
    places     -- maps a tag name to a list of position pairs; the second
                  element of the pair at index ``rank`` is the index in
                  ``tokensline`` that receives the original text.

    Raises Exception when the token found at that index is not the
    placeholder of the expected tag name.
    """
    for tag, original_text in tagdict.items():
        kind = tag[0]
        rank = tag[1]
        target_index = places[kind][rank][1]
        # Drop this check to allow a tag to replace a token that is not a
        # placeholder of the same name.
        if tokensline[target_index] != tagsymbol+kind:
            raise Exception("Tag mismatch:"+kind+str(rank)+" => "+tokensline[target_index])
        tokensline[target_index] = original_text
def dump_tagdict_to_file(dic, f):
    """Append the pickled form of ``dic`` to the open file object ``f``."""
    pickler = cPickle.Pickler(f)
    pickler.dump(dic)
def load_tagdict_from_file(f):
    """Read the next pickled object from the open file object ``f``.

    NOTE(review): unpickling executes whatever the file describes; only feed
    this function tag files written by this script.
    """
    unpickler = cPickle.Unpickler(f)
    return unpickler.load()
def get_places_from_details_file(f, tagdict=None):
    """Locate the placeholder tokens of one sentence in a details file.

    Lines are read from ``f`` until one containing
    "SCORES (UNWEIGHTED/WEIGHTED):" is found.  Every "SOURCE:" line met on the
    way must be followed by a "TRANSLATED AS:" line and a "WORD ALIGNED:"
    line; together they describe one translated segment.

    f       -- open file object positioned at the start of a sentence.
    tagdict -- optional dictionary keyed by (tag name, rank); when it is
               non-empty only tag names occurring in it are recorded,
               otherwise every placeholder token is recorded.

    Returns a dictionary mapping each tag name to the list of
    (source token index, target token index) pairs of its placeholders, both
    counted from the start of the sentence, in the order they were read.

    Raises Exception when the end of the file is reached before the SCORES
    line or when the "TRANSLATED AS:" line is missing; the remaining sanity
    checks are assertions.

    NOTE(review): the assertion on the segment start assumes the segments are
    listed in source order -- confirm this holds when the decoder reorders.
    """
    if tagdict:
        #we're only interested in the tokens that match a tagtype in dict:
        token_names_to_look_for = set([t[0] for t in tagdict.keys()])
    else:
        token_names_to_look_for = None
    res = {}
    sourcesegindex = 0
    targetsegindex = 0
    while True:
        line = f.readline()
        if not line:
            raise Exception("EOF reached while parsing alignment details file")
        if line.find("SCORES (UNWEIGHTED/WEIGHTED):") >= 0:
            return res
        sourcematch = re.match(r"\s*SOURCE:\s*\[(\d+)\.\.\d+\]\s+(.*)", line, re.UNICODE)
        if not sourcematch:
            continue
        assert(int(sourcematch.group(1)) == sourcesegindex)
        sourcetoks = sourcematch.group(2).split()
        line = f.readline()
        targetmatch = re.match(r"\s*TRANSLATED AS:\s+(.*)", line)
        if not targetmatch:
            raise Exception("TRANSLATED AS line expected in alignment details file, got: "+line)
        targettoks = targetmatch.group(1).split()
        line = f.readline()
        aligmatch = re.match(r"\s*WORD ALIGNED:\s*([0-9].*)", line)
        assert (aligmatch)
        # Source token index -> target token index, both relative to the segment.
        wordaligns = {}
        for a in re.finditer("([0-9]+)-([0-9]+)", aligmatch.group(1)):
            assert(int(a.group(1)) not in wordaligns)
            wordaligns[int(a.group(1))] = int(a.group(2))
        # enumerate keeps tokindex equal to the token's position in the
        # segment; a counter bumped only for recorded tags would look up the
        # alignment of the wrong word as soon as a tag is not the first token.
        for tokindex, tok in enumerate(sourcetoks):
            if not tok.startswith(tagsymbol):
                continue
            tokenname = tok[len(tagsymbol):]
            if token_names_to_look_for and (tokenname not in token_names_to_look_for):
                continue
            #you can remove this assertion if you don't want to ensure that tag replacement only occurs between 2 tags with the same name
            ttok = targettoks[wordaligns[tokindex]]
            assert (ttok == tagsymbol+tokenname)
            res.setdefault(tokenname, []).append(
                (sourcesegindex+tokindex, targetsegindex+wordaligns[tokindex]))
        sourcesegindex += len(sourcetoks)
        targetsegindex += len(targettoks)
def restore_all(infile, outfile, segmentation_file, tagsfile):
    """Restore the original texts in every translated line of ``infile``.

    For each input line one tag dictionary is loaded from ``tagsfile`` and one
    sentence is parsed from ``segmentation_file``, so the three inputs must
    describe the same sentences in the same order.

    infile            -- iterable of translated lines.
    outfile           -- object with a ``write`` method receiving the result.
    segmentation_file -- open details file, see get_places_from_details_file.
    tagsfile          -- open file of pickled tag dictionaries.

    The line is split on whitespace and written back with its tokens joined
    by single spaces and terminated by a newline.
    """
    for line in infile:
        dic = load_tagdict_from_file(tagsfile)
        places = get_places_from_details_file(segmentation_file, dic)
        tokensline = line.split()
        restore_tags(tokensline, dic, places)
        # str.join instead of the deprecated string.join (removed in Python 3).
        outfile.write(" ".join(tokensline)+"\n")
# Pattern of each tag type; the first group is the text that is replaced by
# the placeholder and restored after translation.
tag_regexps = {"NUM":"([0-9]+)", "HTMLTAG":"(<[^>]*>)"}
def tokenize_all(infile, outfile, tagsfile):
    """Replace numbers and html-type tags by placeholders in every line.

    infile   -- iterable of lines to preprocess.
    outfile  -- object with a ``write`` method receiving the tagged lines.
    tagsfile -- open file receiving one pickled tag dictionary per line.
    """
    for line in infile:
        dic = {}
        # HTMLTAG goes first: if NUM ran first, digits inside a tag (e.g.
        # <h1>) would be stored as NUM entries and their placeholders would
        # then be swallowed into the HTMLTAG text, leaving the dictionary with
        # NUM entries that no longer match any token of the line.
        for tagtype in ("HTMLTAG", "NUM"):
            line = extract_tags(line, tag_regexps[tagtype], tagtype, dic)
        dump_tagdict_to_file(dic, tagsfile)
        outfile.write(line)
######################################################################
import sys, codecs
from os.path import basename
def usage():
    """Write the command-line help to stderr and exit with status 1.

    stdout carries the processed text, so the help must not be written there.
    """
    prefix = "Usage: "+basename(sys.argv[0])
    sys.stderr.write(prefix+" tag <tagfile> (use /dev/null for training)\n")
    # Pad with spaces so that "detag" lines up under "tag".
    sys.stderr.write(len(prefix)*" "+" detag <tagfile> <segmentation-info-file> (output of moses -T)\n")
    sys.stderr.write("Both options read stdin and write on stdout\n")
    sys.exit(1)
if __name__ == '__main__':
    # Command-line entry point: "tag" preprocesses stdin, "detag" restores
    # the original texts in a translation read from stdin.
    if len(sys.argv) < 2: usage()
    # Decode/encode UTF-8 at the I/O boundary: the placeholders are unicode,
    # so mixing them with raw byte lines fails on non-ASCII text and writing
    # them to a piped stdout fails with the default ASCII codec.  Lines are
    # decoded one by one so that only "\n" ends a line.
    raw_in = getattr(sys.stdin, "buffer", sys.stdin)
    text_in = (raw.decode("utf8") for raw in raw_in)
    text_out = codecs.getwriter("utf8")(getattr(sys.stdout, "buffer", sys.stdout))
    if sys.argv[1] == "tag":
        if not len(sys.argv) == 3: usage()
        # Pickle data is binary: open the tag file in binary mode.
        tagsfile = open(sys.argv[2], "wb")
        try:
            tokenize_all(text_in, text_out, tagsfile)
        finally:
            tagsfile.close()
        sys.exit(0)
    elif sys.argv[1] == "detag":
        if not len(sys.argv) == 4: usage()
        # Same opening order as before (tag file first, then details file).
        tagsfile = open(sys.argv[2], "rb")
        try:
            detailsfile = codecs.open(sys.argv[3], "r", "utf8")
            try:
                restore_all(text_in, text_out, detailsfile, tagsfile)
            finally:
                detailsfile.close()
        finally:
            tagsfile.close()
        sys.exit(0)
    else:
        usage()
tag-wrapper
Description: Binary data
_______________________________________________ Moses-support mailing list [email protected] http://mailman.mit.edu/mailman/listinfo/moses-support
