Hi,
"train-model.perl" with the parameter "-phrase-word-alignment" adds
word-for-word alignment information to the phrase table. Unfortunately
this information gets lost when converting the textual phrase-table
into a binary format with "processPhraseTable". Using
"processPhraseTable -alignment-info" was meant to store the alignment
information in the binary table as well. This functionality is broken
since the format for the word alignment information changed and
currently no word alignment information is stored in the binary phrase
tables. Being required to use the textual file limits the size of the
phrase-table with respect to the memory on the server.
The attached patch provides the missing changes. It stores new-style
alignment information with the target candidates in the
"phrase-table.binphr.tgtdata.wa" file and reads them out correspondingly.
(It doesn't split the alignment information into source and target
alignment as in the old implementation/format; it keeps it in a format
supported by "TargetPhrase::SetAlignmentInfo(std::string&)".)
I tested the change with valgrind for both "moses" and
"processPhraseTable" in a smaller moses translation system without any
complaints. Both the translation and the alignment file produced with
"moses -use-alignment-info -print-alignment-info -T <File>" are
identical, regardless of whether a text or binary phrase-table is used. The
patch should not change the behavior for phrase-tables without
word-alignment.
I hope you find the patch useful and that it can be committed to the
repository. Of course, please let me know if any modifications are necessary
or desirable.
best regards
Christof
diff -wcr moses-2010-09-24/misc/queryPhraseTable.cpp
moses-2010-09-24.svn/misc/queryPhraseTable.cpp
*** moses-2010-09-24/misc/queryPhraseTable.cpp 2010-10-20 18:04:04.000000000
-0700
--- moses-2010-09-24.svn/misc/queryPhraseTable.cpp 2010-09-24
12:57:04.000000000 -0700
***************
*** 46,55 ****
srcphrase = Moses::Tokenize<std::string>(line);
std::vector<Moses::StringTgtCand> tgtcands;
! std::vector<std::string> wordAlignment;
if(useAlignments)
! ptree.GetTargetCandidates(srcphrase, tgtcands,
wordAlignment);
else
ptree.GetTargetCandidates(srcphrase, tgtcands);
--- 46,55 ----
srcphrase = Moses::Tokenize<std::string>(line);
std::vector<Moses::StringTgtCand> tgtcands;
! std::vector<Moses::StringWordAlignmentCand> src_wa, tgt_wa;
if(useAlignments)
! ptree.GetTargetCandidates(srcphrase, tgtcands, src_wa,
tgt_wa);
else
ptree.GetTargetCandidates(srcphrase, tgtcands);
***************
*** 60,66 ****
std::cout << " |||";
if(useAlignments) {
! std::cout << " " << wordAlignment[i] << " |||";
}
for(uint j = 0; j < tgtcands[i].second.size(); j++)
--- 60,78 ----
std::cout << " |||";
if(useAlignments) {
! for(uint j = 0; j < src_wa[i].second.size();
j++)
! if(src_wa[i].second[j] == "-1")
! std::cout << " ()";
! else
! std::cout << " (" <<
src_wa[i].second[j] << ")";
! std::cout << " |||";
!
! for(uint j = 0; j < tgt_wa[i].second.size();
j++)
! if(tgt_wa[i].second[j] == "-1")
! std::cout << " ()";
! else
! std::cout << " (" <<
tgt_wa[i].second[j] << ")";
! std::cout << " |||";
}
for(uint j = 0; j < tgtcands[i].second.size(); j++)
diff -wcr moses-2010-09-24/moses/src/PDTAimp.h
moses-2010-09-24.svn/moses/src/PDTAimp.h
*** moses-2010-09-24/moses/src/PDTAimp.h 2010-10-20 17:58:53.000000000
-0700
--- moses-2010-09-24.svn/moses/src/PDTAimp.h 2010-09-24 12:57:04.000000000
-0700
***************
*** 160,167 ****
// get target phrases in string representation
std::vector<StringTgtCand> cands;
! std::vector<std::string> wacands;
! m_dict->GetTargetCandidates(srcString,cands,wacands);
if(cands.empty())
{
return 0;
--- 160,169 ----
// get target phrases in string representation
std::vector<StringTgtCand> cands;
! std::vector<StringWordAlignmentCand> swacands;
! std::vector<StringWordAlignmentCand> twacands;
! // m_dict->GetTargetCandidates(srcString,cands);
! m_dict->GetTargetCandidates(srcString,cands,swacands,twacands);
if(cands.empty())
{
return 0;
***************
*** 177,190 ****
StringTgtCand::first_type const&
factorStrings=cands[i].first;
StringTgtCand::second_type const&
probVector=cands[i].second;
std::vector<float> scoreVector(probVector.size());
std::transform(probVector.begin(),probVector.end(),scoreVector.begin(),
TransformScore);
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),
FloorScore);
!
//CreateTargetPhrase(targetPhrase,factorStrings,scoreVector,&src);
!
CreateTargetPhrase(targetPhrase,factorStrings,scoreVector,wacands[i],&src);
costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
tCands.push_back(targetPhrase);
}
--- 179,194 ----
StringTgtCand::first_type const&
factorStrings=cands[i].first;
StringTgtCand::second_type const&
probVector=cands[i].second;
+ //StringWordAlignmentCand::second_type const&
swaVector=swacands[i].second;
+ //StringWordAlignmentCand::second_type const&
twaVector=twacands[i].second;
std::vector<float> scoreVector(probVector.size());
std::transform(probVector.begin(),probVector.end(),scoreVector.begin(),
TransformScore);
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),
FloorScore);
!
CreateTargetPhrase(targetPhrase,factorStrings,scoreVector,&src);
!
//CreateTargetPhrase(targetPhrase,factorStrings,scoreVector,swaVector,twaVector,&src);
costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
tCands.push_back(targetPhrase);
}
***************
*** 275,289 ****
};
- void CreateTargetPhrase(TargetPhrase& targetPhrase,
-
StringTgtCand::first_type const& factorStrings,
-
StringTgtCand::second_type const& scoreVector,
-
const std::string& alignmentString,
-
Phrase const* srcPtr=0) const
- {
- CreateTargetPhrase(targetPhrase, factorStrings, scoreVector,
srcPtr);
- targetPhrase.SetAlignmentInfo(alignmentString);
- }
void CreateTargetPhrase(TargetPhrase& targetPhrase,
StringTgtCand::first_type const& factorStrings,
--- 279,284 ----
***************
*** 301,306 ****
--- 296,303 ----
}
targetPhrase.SetScore(m_obj->GetFeature(), scoreVector,
m_weights, m_weightWP, *m_languageModels);
targetPhrase.SetSourcePhrase(srcPtr);
+
+ // targetPhrase.CreateAlignmentInfo("???", "???", 44);
}
diff -wcr moses-2010-09-24/moses/src/PhraseDictionaryTree.cpp
moses-2010-09-24.svn/moses/src/PhraseDictionaryTree.cpp
*** moses-2010-09-24/moses/src/PhraseDictionaryTree.cpp 2010-10-21
14:49:39.000000000 -0700
--- moses-2010-09-24.svn/moses/src/PhraseDictionaryTree.cpp 2010-09-24
12:57:04.000000000 -0700
***************
*** 7,13 ****
#include <iostream>
#include <fstream>
#include <string>
- #include <vector>
namespace Moses
{
--- 7,12 ----
***************
*** 26,39 ****
class TgtCand {
IPhrase e;
Scores sc;
! std::string m_alignment;
public:
TgtCand() {}
! TgtCand(const IPhrase& a, const Scores& b , const std::string&
alignment)
: e(a)
, sc(b)
! , m_alignment(alignment)
{}
TgtCand(const IPhrase& a,const Scores& b) : e(a),sc(b) {}
--- 25,40 ----
class TgtCand {
IPhrase e;
Scores sc;
! WordAlignments m_sourceAlignment, m_targetAlignment;
public:
TgtCand() {}
! TgtCand(const IPhrase& a, const Scores& b
! , const WordAlignments
&sourceAlignment, const WordAlignments &targetAlignment)
: e(a)
, sc(b)
! , m_sourceAlignment(sourceAlignment)
! , m_targetAlignment(targetAlignment)
{}
TgtCand(const IPhrase& a,const Scores& b) : e(a),sc(b) {}
***************
*** 57,75 ****
{
fWriteVector(f,e);
fWriteVector(f,sc);
! fWriteString(f, m_alignment.c_str(), m_alignment.size());
}
void readBinWithAlignment(FILE* f)
{
fReadVector(f,e);
fReadVector(f,sc);
! fReadString(f, m_alignment);
}
const IPhrase& GetPhrase() const {return e;}
const Scores& GetScores() const {return sc;}
! const std::string& GetAlignment() const {return m_alignment;}
};
--- 58,79 ----
{
fWriteVector(f,e);
fWriteVector(f,sc);
! fWriteStringVector(f, m_sourceAlignment);
! fWriteStringVector(f, m_targetAlignment);
}
void readBinWithAlignment(FILE* f)
{
fReadVector(f,e);
fReadVector(f,sc);
! fReadStringVector(f, m_sourceAlignment);
! fReadStringVector(f, m_targetAlignment);
}
const IPhrase& GetPhrase() const {return e;}
const Scores& GetScores() const {return sc;}
! const WordAlignments& GetSourceAlignment() const {return
m_sourceAlignment;}
! const WordAlignments& GetTargetAlignment() const {return
m_targetAlignment;}
};
***************
*** 210,216 ****
// convert target candidates from internal data structure to
the external one
void ConvertTgtCand(const TgtCands& tcands,std::vector<StringTgtCand>&
rv,
!
std::vector<std::string>& wa) const
{
for(TgtCands::const_iterator
i=tcands.begin();i!=tcands.end();++i)
{
--- 214,221 ----
// convert target candidates from internal data structure to
the external one
void ConvertTgtCand(const TgtCands& tcands,std::vector<StringTgtCand>&
rv,
!
std::vector<StringWordAlignmentCand>& swa,
!
std::vector<StringWordAlignmentCand>& twa) const
{
for(TgtCands::const_iterator
i=tcands.begin();i!=tcands.end();++i)
{
***************
*** 221,227 ****
for(size_t j=0;j<iphrase.size();++j)
vs.push_back(&tv->symbol(iphrase[j]));
rv.push_back(StringTgtCand(vs,i->GetScores()));
! wa.push_back(i->GetAlignment());
}
}
--- 226,233 ----
for(size_t j=0;j<iphrase.size();++j)
vs.push_back(&tv->symbol(iphrase[j]));
rv.push_back(StringTgtCand(vs,i->GetScores()));
!
swa.push_back(StringWordAlignmentCand(vs,(i->GetSourceAlignment())));
!
twa.push_back(StringWordAlignmentCand(vs,(i->GetTargetAlignment())));
}
}
***************
*** 326,338 ****
{
Scores sc=tcand[i].GetScores();
! std::string trgAlign = tcand[i].GetAlignment();
const IPhrase& iphr=tcand[i].GetPhrase();
out << i << " -- " << sc << " -- ";
for(size_t j=0;j<iphr.size();++j) out <<
tv->symbol(iphr[j])<<" ";
! out<< " -- " << trgAlign;
out << std::endl;
}
}
--- 332,348 ----
{
Scores sc=tcand[i].GetScores();
! WordAlignments
srcAlign=tcand[i].GetSourceAlignment();
! WordAlignments
trgAlign=tcand[i].GetTargetAlignment();
const IPhrase& iphr=tcand[i].GetPhrase();
out << i << " -- " << sc << " -- ";
for(size_t j=0;j<iphr.size();++j) out <<
tv->symbol(iphr[j])<<" ";
! out<< " -- ";
! for (size_t j=0;j<srcAlign.size();j++) out <<
" " << srcAlign[j];
! out << " -- ";
! for (size_t j=0;j<trgAlign.size();j++) out <<
" " << trgAlign[j];
out << std::endl;
}
}
***************
*** 390,396 ****
void PhraseDictionaryTree::
GetTargetCandidates(const std::vector<std::string>& src,
std::vector<StringTgtCand>& rv,
!
std::vector<std::string>& wa) const
{
IPhrase f(src.size());
for(size_t i=0;i<src.size();++i)
--- 400,407 ----
void PhraseDictionaryTree::
GetTargetCandidates(const std::vector<std::string>& src,
std::vector<StringTgtCand>& rv,
!
std::vector<StringWordAlignmentCand>& swa,
!
std::vector<StringWordAlignmentCand>& twa) const
{
IPhrase f(src.size());
for(size_t i=0;i<src.size();++i)
***************
*** 401,407 ****
TgtCands tgtCands;
imp->GetTargetCandidates(f,tgtCands);
! imp->ConvertTgtCand(tgtCands,rv,wa);
}
--- 412,418 ----
TgtCands tgtCands;
imp->GetTargetCandidates(f,tgtCands);
! imp->ConvertTgtCand(tgtCands,rv,swa,twa);
}
***************
*** 480,489 ****
const std::string &sourcePhraseString =tokens[0]
,&targetPhraseString=tokens[1]
,&scoreString = tokens[2];
! const std::string empty;
! const std::string &alignmentString = PrintWordAlignment() ?
tokens[3] : empty;
IPhrase f,e;
Scores sc;
std::vector<std::string> wordVec = Tokenize(sourcePhraseString);
for (size_t i = 0 ; i < wordVec.size() ; ++i)
--- 491,500 ----
const std::string &sourcePhraseString =tokens[0]
,&targetPhraseString=tokens[1]
,&scoreString = tokens[2];
!
IPhrase f,e;
Scores sc;
+ WordAlignments sourceAlignment, targetAlignment;
std::vector<std::string> wordVec = Tokenize(sourcePhraseString);
for (size_t i = 0 ; i < wordVec.size() ; ++i)
***************
*** 565,571 ****
abort();
}
}
! tgtCands.push_back(TgtCand(e,sc, alignmentString));
assert(currFirstWord!=InvalidLabelId);
}
if (PrintWordAlignment())
--- 576,582 ----
abort();
}
}
! tgtCands.push_back(TgtCand(e,sc, sourceAlignment,
targetAlignment));
assert(currFirstWord!=InvalidLabelId);
}
if (PrintWordAlignment())
***************
*** 650,660 ****
void PhraseDictionaryTree::
GetTargetCandidates(PrefixPtr p,
std::vector<StringTgtCand>& rv,
!
std::vector<std::string>& wa) const
{
TgtCands tcands;
imp->GetTargetCandidates(p,tcands);
! imp->ConvertTgtCand(tcands,rv,wa);
}
std::string PhraseDictionaryTree::GetScoreProducerDescription() const{
--- 661,672 ----
void PhraseDictionaryTree::
GetTargetCandidates(PrefixPtr p,
std::vector<StringTgtCand>& rv,
!
std::vector<StringWordAlignmentCand>& swa,
!
std::vector<StringWordAlignmentCand>& twa) const
{
TgtCands tcands;
imp->GetTargetCandidates(p,tcands);
! imp->ConvertTgtCand(tcands,rv,swa,twa);
}
std::string PhraseDictionaryTree::GetScoreProducerDescription() const{
diff -wcr moses-2010-09-24/moses/src/PhraseDictionaryTree.h
moses-2010-09-24.svn/moses/src/PhraseDictionaryTree.h
*** moses-2010-09-24/moses/src/PhraseDictionaryTree.h 2010-10-20
18:14:57.000000000 -0700
--- moses-2010-09-24.svn/moses/src/PhraseDictionaryTree.h 2010-09-24
12:57:04.000000000 -0700
***************
*** 79,85 ****
// get the target candidates for a given phrase
void GetTargetCandidates(const std::vector<std::string>& src,
std::vector<StringTgtCand>& rv,
!
std::vector<std::string>& wa) const;
/*****************************
* access to prefix tree *
--- 79,86 ----
// get the target candidates for a given phrase
void GetTargetCandidates(const std::vector<std::string>& src,
std::vector<StringTgtCand>& rv,
!
std::vector<StringWordAlignmentCand>& swa,
!
std::vector<StringWordAlignmentCand>& twa) const;
/*****************************
* access to prefix tree *
***************
*** 111,117 ****
std::vector<StringTgtCand>& rv) const;
void GetTargetCandidates(PrefixPtr p,
std::vector<StringTgtCand>& rv,
!
std::vector<std::string>& wa) const;
// print target candidates for a given prefix pointer to a stream,
mainly
// for debugging
--- 112,119 ----
std::vector<StringTgtCand>& rv) const;
void GetTargetCandidates(PrefixPtr p,
std::vector<StringTgtCand>& rv,
!
std::vector<StringWordAlignmentCand>& swa,
!
std::vector<StringWordAlignmentCand>& twa) const;
// print target candidates for a given prefix pointer to a stream,
mainly
// for debugging
_______________________________________________
Moses-support mailing list
Moses-support@mit.edu
http://mailman.mit.edu/mailman/listinfo/moses-support