On Friday 06 August 2010 14:06:55 Sylvain Raybaud wrote:
> * moses cannot translate "real world" graph generated with this tool.
> See for example: http://perso.crans.org/raybaud/realworld.slf.gz
got it: sphinx hypothesis contains a quote:
nous incombe d' assurer
which ends up like this in plf:
('d'',x,y)
see example7 attached (with quote) and example7.cleaned (no quote): the former
gives me the same error, the later is translated just fine.
how are we supposed to deal with quotes in PLF?
regards,
--
Sylvain
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#
# Copyright (C) 2010, Sylvain Raybaud <[email protected]>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Sylvain Raybaud nor the names of its contributors may
# be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY Sylvain Raybaud <[email protected]> ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Sylvain Raybaud BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# This piece of software turns a word lattice in SLF (HTK format) and converts it
# to PLF (Moses input format)
# SLF is documented there: http://labrosa.ee.columbia.edu/doc/HTKBook21/node257.html
# PLF is documented there: http://www.statmt.org/moses/?n=Moses.WordLattices
# I tried to keep most of the flexibility of SLF
#
# Usage: slf2plf [options] input_file > output
# or: slf2plf [options] input_file.gz > output
# or: slf2plf [options] < input_file > output
# Options:
# -C, --dont-clean: don't remove noise information from input lattice
# -h, --help: print short help
#
# Principle:
# 1) load graph into memory as a list of nodes and a list of links
# each node has access to the list of its incoming and outgoing
# links
# 2) Compute a topological ordering of nodes
# 3) Output lattice in moses input format
#
# The algorithm implemented for topological sort is the one described here on the 6 august 2010:
# http://en.wikipedia.org/wiki/Topological_sorting
#
# TODO:
# a) Make the program robust to missing accoustic and lm scores in the lattice
# b) Make it possible to choose how output lattice score is computed from nodes and link attributes
# c) Deal with sublattices
# d) Make the program robust to incorrect input
USAGE = """
%prog [options] input_file > output
or: %prog [options] input_file.gz > output
or: %prog [options] < input_file > output
input_file is a graph in SLF format (HTK), possibly gziped (.gz extension required)."""
from optparse import OptionParser
import gzip
import sys
import re
import copy
#general purpose tools to make your life easier
def open_plain_or_gz(filename,mode='r'):
"""Transparently open gziped or plain files, based on file extension"""
return gzip.open(filename,mode) if filename.endswith(".gz") else open(filename,mode)
def verbose_msg(opts,msg):
if opts.verbose:
sys.stderr.write(msg+'\n')
def warn(msg):
"""Print a warning message to stderr. The program continues."""
sys.stderr.write("*** WARNING *** %s\n" % msg)
def error(msg,code=1):
"""Print an error message to stderr and exists with the given code (default: 1)."""
sys.stderr.write("*** ERROR *** %s\n" % msg)
exit(code)
# regular expressions that will be used later
__attr_re__ = r"(?P<attribute>[^=]+)=(?P<quote>(\"|'')?)(?P<value>.*)(?P=quote)" # filed_name=field_value value may be "quoted"
__size_re__ = r"(?P<attribute>N|NODES|L|LINKS)=(?P<value>\d+)" # for parsing size information in SLF
__node_re__ = r"I=(?P<id>\d+)(?P<attributes>(\s+%s)*)" % __attr_re__ # for parsing node definition in SLF
__link_re__ = r"J=(?P<id>\d+)(?P<attributes>(\s+%s)*)" % __attr_re__ # for parsing link information in SLF
__noise_re__ = r"\+\+[^\+]*\+\+"
# a few mandatory attributes in short and long forms. They will be stored in long form
ATTRIBUTES = {
'E': 'END',
'S': 'START',
'W': 'WORD',
'a': 'acoustic',
'l': 'language'
}
def __get_long_form__(string):
if ATTRIBUTES.has_key(string):
return ATTRIBUTES[string]
else:
return string
# remove comments and blank lines from input
def __clean__(lines):
data = []
for l in lines:
if l.strip()!='' and l[0]!='#':
data.append(re.sub("#.*\n",'',l))
return data
# print a link out of the first node in the list 'remains'
def __print_links_out__(lattice,remains):
n = lattice.nodes[remains[0]]
sys.stdout.write("(")
for l in n.outgoing:
link = lattice.links[l]
distance = remains.index(link.to_node) # this is the distance between current node and the node it is linked to
a = float(link.attributes['acoustic'])
l = float(link.attributes['language'])
sys.stdout.write("('%s',%f,%i)," % (link.attributes['WORD'],a+l*lattice.lmscale,distance))
sys.stdout.write("),")
class Node:
""" Node definition for using in word lattice """
def __init__(self,index,attributes_string=None):
"""index is just the id of the node. attribute_string is a succession fo definition of the form: name=value"""
self.index = index
self.incoming = set()
self.outgoing = set()
self.attributes = {}
if attributes_string:
attributes = attributes_string.split()
for attr_str in attributes:
m = re.match(__attr_re__,attr_str)
attr = __get_long_form__(m.group('attribute'))
val = m.group('value')
if self.attributes.has_key(attr):
warn("Node %i: attribute '%s' redefined." % (self.index,attr))
self.attributes[attr] = val
class Link:
""" Link definition for using in word lattice """
def __init__(self,index,attributes_string=None):
"""index is just the id of the link. attribute_string is a succession fo definition of the form: name=value"""
self.index = index
self.from_node = -1
self.to_node = -1
self.word = ""
self.attributes = {}
if attributes_string:
attributes = attributes_string.split()
for attr_str in attributes:
m = re.match(__attr_re__,attr_str)
attr = __get_long_form__(m.group('attribute'))
val = m.group('value')
if self.attributes.has_key(attr):
warn("Node %i: attribute '%s' redefined." % (self.index,attr))
self.attributes[attr] = val
try:
self.from_node = int(self.attributes['START'])
self.to_node = int(self.attributes['END'])
except:
error("Incomplete definition of link %i" % self.index)
#TODO: check that word attribute is defined either here or on destination node
class Lattice:
""" Class representing a word lattice with constructor and method for loading a lattice in HTK Standard Lattice Format.
Also has a method for writing the lattice in Moses input format (PLF) """
def __init__(self,lattfile=None,lines=None,clean=True): # can read lattice either from file or data in memory
""" lattfile should be an open stream. lines should contain a reversed list of already read lines. At least one of them must be provided. Option 'clean' control weither or not noise information should be removes from the lattice. """
data = []
if lattfile==None and (lines==None or len(lines)==0):
error("Where shall I read the lattice from?")
if lattfile!=None:
data = lattfile.readlines() #not very memory efficient, but easier for parsing
data.reverse() # lines to be parsed will be stored on a stack
if lines!=None:
data = data+lines
data = __clean__(data) #remove comments and empty lines
self.nodes_count = -1
self.links_count = -1
self.nodes = {}
self.links = {}
self.sublattices = {}
self.lmscale = 1.0
self.attributes = {}
self.__load_graph__(data,clean)
def __load_graph__(self,lines,clean):
self.__parse_header__(lines)
self.__parse_counts__(lines)
self.__parse_nodes__(lines)
removed_links = self.__parse_links__(lines,clean)
removed_nodes = 0
if clean:
removed_nodes = self.remove_orphan_nodes()
if len(self.nodes)+removed_nodes!=self.nodes_count or len(self.links)+removed_links!=self.links_count:
warn("Incorrect nodes or links count")
if self.attributes.has_key('lmscale'):
self.lmscale = float(self.attributes['lmscale'])
def __parse_header__(self,lines):
attributes_tmp = {}
go = True # carry on as long as we are parsing a header
while len(lines)>0 and go:
l = lines.pop()
m = re.match(__attr_re__,l)
if m and not re.match(__size_re__+".*",l): # should match an attribute but *not* a size definition
attr = m.group('attribute')
val = m.group('value')
if attributes_tmp.has_key(attr):
warn("Attribute '%s' redefined." % attr)
attributes_tmp[attr] = val
if attr=="SUBLAT": # we are parsing a sublattice. go down, create it and then carry on.
sublat = Lattice(lines=lines)
# merge attributes
for k in attributes_tmp:
if sublat.attributes.has_key(k):
warn("Sublattice '%s': attribute '%s' redefined." % (val,k))
sublat.attributes.update(attributes_tmp)
attributes_tmp = {}
self.sublattices[val] = sublat
if lines.pop().strip()!=":":
warn("Missing colon at the end of sublattice '%s'. Continuing anyway, but things could go wrong." % val)
else:
go = False
if not go:
lines.append(l)
self.attributes = attributes_tmp
# If I understand correctly counts may be written in any order, on 1 or 2 lines
# Therefore parsed lines need to be split
def __parse_counts__(self,lines):
go = True
while len(lines)>0 and go:
l = lines.pop()
terms = l.split()
while len(terms)>0 and go:
t = terms.pop()
m = re.match(__size_re__,t)
if m:
if m.group('attribute')=='N' or m.group('attribute')=='NODES':
if self.nodes_count!=-1:
warn("Nodes count redefined.")
self.nodes_count = int(m.group('value'))
elif m.group('attribute')=='L' or m.group('attribute')=='LINKS':
if self.links_count!=-1:
warn("Links count redefined.")
self.links_count = int(m.group('value'))
else:
go = False
if not go:
lines.append(l)
def __parse_nodes__(self,lines):
go = True
while len(lines)>0 and go:
l = lines.pop()
m = re.match(__node_re__,l)
if m:
index = int(m.group('id'))
n = Node(index,m.group('attributes'))
if self.nodes.has_key(index):
warn("Node %i redefined." % index)
self.nodes[index] = n
else:
go = False
if not go:
lines.append(l)
def __parse_links__(self,lines,clean):
removed_links = 0
go = True
while len(lines)>0 and go:
l = lines.pop()
m = re.match(__link_re__,l)
if m:
index = int(m.group('id'))
if self.links.has_key(index):
warn("Link %i redefined." % index)
e = Link(index,m.group('attributes'))
# if link is not noise, or if no cleaning required, insert link
if (not clean) or (not 'WORD' in e.attributes) or (not re.match(__noise_re__,e.attributes['WORD'])):
self.__update_nodes__(e) # Information about nodes is contained in link definition
self.links[index] = e
else:
removed_links += 1
else:
go = False
if not go:
lines.append(l)
return removed_links
def __update_nodes__(self,link):
# in case the nodes were not explicitely defined
if not self.nodes.has_key(link.from_node):
self.nodes[link.from_node] = Node(link.from_node)
if not self.nodes.has_key(link.to_node):
self.nodes[link.to_node] = Node(link.to_node)
# update connection information of nodes
self.nodes[link.from_node].outgoing.add(link.index)
self.nodes[link.to_node].incoming.add(link.index)
def remove_orphan_nodes(self):
""" Put orphaned nodes out of their misery """
removed_nodes = 0
for n in self.nodes.values():
if len(n.incoming)==0 and len(n.outgoing)==0:
self.nodes.pop(n.index)
removed_nodes += 1
return removed_nodes
# write lattice in PLF
def write_plf(self):
""" Write lattice in moses input format (PLF) on standard output. """
# get topological ordering
ordered_nodes = topological_ordering(self)
# NOT THAT EASY
# ordered nodes of a sublattice can be inserted at the position of the node they replace
#for i,n in len(ordered_nodes):
#if self.nodes[n].has_key('L'): # this node is in fact a sublattice
#sublat_ordered = self.sublattices
# write lattice
sys.stdout.write("(")
for i,n in enumerate(ordered_nodes[:-1]): # list all nodes in topological order
__print_links_out__(self,ordered_nodes[i:]) # and write links leaving them
sys.stdout.write(")\n")
return
# topological sorting algorithm needs to be initialised with a list of initial nodes having no incoming link
def get_starting_nodes(lattice):
""" Given an instance of class Lattice, output the list of all nodes with no incoming link """
S = []
for n in lattice.nodes.values():
if len(n.incoming)==0:
S.append(n.index)
return S
def topological_ordering(original_lattice):
""" Given an instance of class Lattice, output a topological ordering of nodes (as a list of indexes) """
lattice = copy.deepcopy(original_lattice) # the lattice will be modified by the program. Work on a copy.
start_nodes = get_starting_nodes(lattice)
ordered_nodes = []
while len(start_nodes)>0:
n = start_nodes.pop()
ordered_nodes.append(n)
for l in set(lattice.nodes[n].outgoing):
m = lattice.nodes[lattice.links[l].to_node]
lattice.links.pop(l)
m.incoming.remove(l)
lattice.nodes[n].outgoing.remove(l)
if len(m.incoming)==0:
start_nodes.append(m.index)
if len(lattice.links)>0:
warn("Lattice has at least one cycle.")
return ordered_nodes
if __name__=='__main__':
parser = OptionParser(usage=USAGE)
parser.add_option('-C','--dont-clean',default=True,action="store_false",dest="clean",help="Don't clean input graph: word lattice graph generated by sphinx contain information about noise which should be removed for translation. Use this option if you want to keep it. [clean]")
parser.add_option('-v','--verbose',default=False,action="store_true",dest="verbose",help="verbosity [%default]")
opts,args = parser.parse_args()
if len(args)>1:
parser.print_help()
error("bad command line.",1)
input_file = sys.stdin if (len(args)==0 or args[0]=="-") else open_plain_or_gz(args[0])
lattice = Lattice(input_file,clean=opts.clean)
lattice.write_plf()
if not (len(args)==0 or args[0]=="-"):
input_file.close()
VERSION=1.1
UTTERANCE=1
lmscale=1
wdpenalty=0.0
#
# Size line
#
N=5 L=5
#
# Node definitions
#
I=0
I=1
I=2
I=3
I=4
#
# Link definitions.
#
J=0 S=0 E=1 W=nous a=0 l=0
J=1 S=1 E=2 W=incombe a=0 l=0
J=2 S=2 E=3 W=d' a=0 l=0
J=3 S=3 E=4 W=assurer a=0 l=0
J=4 S=0 E=2 W=incombe a=0 l=0
((('nous',0.000000,1),('incombe',0.000000,2),),(('incombe',0.000000,1),),(('d'',0.000000,1),),(('assurer',0.000000,1),),)
((('nous',0.000000,1),('incombe',0.000000,2),),(('incombe',0.000000,1),),(('d',0.000000,1),),(('assurer',0.000000,1),),)
_______________________________________________
Moses-support mailing list
[email protected]
http://mailman.mit.edu/mailman/listinfo/moses-support