Hi,

"train-model.perl" with the parameter "-phrase-word-alignment" adds word-for-word alignment information to the phrase table. Unfortunately, this information gets lost when the textual phrase table is converted into the binary format with "processPhraseTable". Using "processPhraseTable -alignment-info" was meant to store the alignment information in the binary table as well, but this functionality has been broken since the format of the word alignment information changed, and currently no word alignment information is stored in the binary phrase tables. Being required to use the textual file limits the size of the phrase table to what fits into the memory of the server.

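The attached patch provides the missing changes. It stores the new-style alignment information together with the target candidates in the "phrase-table.binphr.tgtdata.wa" file and reads it back correspondingly. (It doesn't split the alignment information into source and target alignment as the old implementation/format did; it keeps it in a format supported by "TargetPhrase::SetAlignmentInfo(std::string&)".) Note that the diff below was generated with the patched tree ("moses-2010-09-24") as the first argument and the unmodified svn checkout ("moses-2010-09-24.svn") as the second, so it has to be applied in reverse ("patch -R") to an svn checkout.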

I tested the change with valgrind for both "moses" and "processPhraseTable" on a smaller Moses translation system, without any complaints. Both the translation and the alignment file produced by "moses -use-alignment-info -print-alignment-info -T <File>" are identical, regardless of whether the textual or the binary phrase table is used. The patch should not change the behavior for phrase tables without word alignment.

I hope you find the patch useful and that it can be committed to the repository. Of course, please let me know if any modifications are necessary or desirable.

best regards
Christof
diff -wcr moses-2010-09-24/misc/queryPhraseTable.cpp 
moses-2010-09-24.svn/misc/queryPhraseTable.cpp
*** moses-2010-09-24/misc/queryPhraseTable.cpp  2010-10-20 18:04:04.000000000 
-0700
--- moses-2010-09-24.svn/misc/queryPhraseTable.cpp      2010-09-24 
12:57:04.000000000 -0700
***************
*** 46,55 ****
                srcphrase = Moses::Tokenize<std::string>(line);
  
                std::vector<Moses::StringTgtCand> tgtcands;
!               std::vector<std::string> wordAlignment;
  
                if(useAlignments)
!                       ptree.GetTargetCandidates(srcphrase, tgtcands, 
wordAlignment);
                else
                        ptree.GetTargetCandidates(srcphrase, tgtcands);
  
--- 46,55 ----
                srcphrase = Moses::Tokenize<std::string>(line);
  
                std::vector<Moses::StringTgtCand> tgtcands;
!               std::vector<Moses::StringWordAlignmentCand> src_wa, tgt_wa;
  
                if(useAlignments)
!                       ptree.GetTargetCandidates(srcphrase, tgtcands, src_wa, 
tgt_wa);
                else
                        ptree.GetTargetCandidates(srcphrase, tgtcands);
  
***************
*** 60,66 ****
                        std::cout << " |||";
  
                        if(useAlignments) {
!                               std::cout << " " << wordAlignment[i] << " |||";
                        }
  
                        for(uint j = 0; j < tgtcands[i].second.size(); j++)
--- 60,78 ----
                        std::cout << " |||";
  
                        if(useAlignments) {
!                               for(uint j = 0; j < src_wa[i].second.size(); 
j++)
!                                       if(src_wa[i].second[j] == "-1")
!                                               std::cout << " ()";
!                                       else
!                                               std::cout << " (" << 
src_wa[i].second[j] << ")";
!                               std::cout << " |||";
! 
!                               for(uint j = 0; j < tgt_wa[i].second.size(); 
j++)
!                                       if(tgt_wa[i].second[j] == "-1")
!                                               std::cout << " ()";
!                                       else
!                                               std::cout << " (" << 
tgt_wa[i].second[j] << ")";
!                               std::cout << " |||";
                        }
  
                        for(uint j = 0; j < tgtcands[i].second.size(); j++)
diff -wcr moses-2010-09-24/moses/src/PDTAimp.h 
moses-2010-09-24.svn/moses/src/PDTAimp.h
*** moses-2010-09-24/moses/src/PDTAimp.h        2010-10-20 17:58:53.000000000 
-0700
--- moses-2010-09-24.svn/moses/src/PDTAimp.h    2010-09-24 12:57:04.000000000 
-0700
***************
*** 160,167 ****
  
                // get target phrases in string representation
                std::vector<StringTgtCand> cands;
!               std::vector<std::string> wacands;
!               m_dict->GetTargetCandidates(srcString,cands,wacands);
                if(cands.empty()) 
                {
                        return 0;
--- 160,169 ----
  
                // get target phrases in string representation
                std::vector<StringTgtCand> cands;
!               std::vector<StringWordAlignmentCand> swacands;
!               std::vector<StringWordAlignmentCand> twacands;
! //            m_dict->GetTargetCandidates(srcString,cands);
!               m_dict->GetTargetCandidates(srcString,cands,swacands,twacands);
                if(cands.empty()) 
                {
                        return 0;
***************
*** 177,190 ****
                        
                        StringTgtCand::first_type const& 
factorStrings=cands[i].first;
                        StringTgtCand::second_type const& 
probVector=cands[i].second;
                        
                        std::vector<float> scoreVector(probVector.size());
                        
std::transform(probVector.begin(),probVector.end(),scoreVector.begin(),
                                                                                
 TransformScore);
                        
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),
                                                                                
 FloorScore);
!                       
//CreateTargetPhrase(targetPhrase,factorStrings,scoreVector,&src);
!                       
CreateTargetPhrase(targetPhrase,factorStrings,scoreVector,wacands[i],&src);
                        
costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
                        tCands.push_back(targetPhrase);
                }
--- 179,194 ----
                        
                        StringTgtCand::first_type const& 
factorStrings=cands[i].first;
                        StringTgtCand::second_type const& 
probVector=cands[i].second;
+                       //StringWordAlignmentCand::second_type const& 
swaVector=swacands[i].second;
+                       //StringWordAlignmentCand::second_type const& 
twaVector=twacands[i].second;
                        
                        std::vector<float> scoreVector(probVector.size());
                        
std::transform(probVector.begin(),probVector.end(),scoreVector.begin(),
                                                                                
 TransformScore);
                        
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),
                                                                                
 FloorScore);
!                       
CreateTargetPhrase(targetPhrase,factorStrings,scoreVector,&src);
!                       
//CreateTargetPhrase(targetPhrase,factorStrings,scoreVector,swaVector,twaVector,&src);
                        
costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
                        tCands.push_back(targetPhrase);
                }
***************
*** 275,289 ****
        };
  
  
-       void CreateTargetPhrase(TargetPhrase& targetPhrase,
-                                                                               
                        StringTgtCand::first_type const& factorStrings,
-                                                                               
                        StringTgtCand::second_type const& scoreVector,
-                                                                               
                        const std::string& alignmentString, 
-                                                                               
                        Phrase const* srcPtr=0) const
-       {
-               CreateTargetPhrase(targetPhrase, factorStrings, scoreVector, 
srcPtr);
-               targetPhrase.SetAlignmentInfo(alignmentString);
-       }       
  
        void CreateTargetPhrase(TargetPhrase& targetPhrase,
                                                                                
                        StringTgtCand::first_type const& factorStrings,
--- 279,284 ----
***************
*** 301,306 ****
--- 296,303 ----
                }
                targetPhrase.SetScore(m_obj->GetFeature(), scoreVector, 
m_weights, m_weightWP, *m_languageModels);
                targetPhrase.SetSourcePhrase(srcPtr);
+               
+ //            targetPhrase.CreateAlignmentInfo("???", "???", 44);
        }
        
        
diff -wcr moses-2010-09-24/moses/src/PhraseDictionaryTree.cpp 
moses-2010-09-24.svn/moses/src/PhraseDictionaryTree.cpp
*** moses-2010-09-24/moses/src/PhraseDictionaryTree.cpp 2010-10-21 
14:49:39.000000000 -0700
--- moses-2010-09-24.svn/moses/src/PhraseDictionaryTree.cpp     2010-09-24 
12:57:04.000000000 -0700
***************
*** 7,13 ****
  #include <iostream>
  #include <fstream>
  #include <string>
- #include <vector>
  
  namespace Moses
  {
--- 7,12 ----
***************
*** 26,39 ****
  class TgtCand {
        IPhrase e;
        Scores sc;
!       std::string m_alignment;
  public:
        TgtCand() {}
        
!       TgtCand(const IPhrase& a, const Scores& b , const std::string& 
alignment) 
                : e(a)
                , sc(b)
!               , m_alignment(alignment)
        {}
        
        TgtCand(const IPhrase& a,const Scores& b) : e(a),sc(b) {}
--- 25,40 ----
  class TgtCand {
        IPhrase e;
        Scores sc;
!       WordAlignments m_sourceAlignment, m_targetAlignment;
  public:
        TgtCand() {}
        
!       TgtCand(const IPhrase& a, const Scores& b
!                                       , const WordAlignments 
&sourceAlignment, const WordAlignments &targetAlignment) 
                : e(a)
                , sc(b)
!               , m_sourceAlignment(sourceAlignment)
!               , m_targetAlignment(targetAlignment)
        {}
        
        TgtCand(const IPhrase& a,const Scores& b) : e(a),sc(b) {}
***************
*** 57,75 ****
        {
                fWriteVector(f,e);
                fWriteVector(f,sc);
!               fWriteString(f, m_alignment.c_str(), m_alignment.size());
        }
        
        void readBinWithAlignment(FILE* f) 
        {
                fReadVector(f,e);
                fReadVector(f,sc);
!               fReadString(f, m_alignment);
        } 
        
        const IPhrase& GetPhrase() const {return e;}
        const Scores& GetScores() const {return sc;}
!       const std::string& GetAlignment() const {return m_alignment;}
  };
    
  
--- 58,79 ----
        {
                fWriteVector(f,e);
                fWriteVector(f,sc);
!               fWriteStringVector(f, m_sourceAlignment);
!               fWriteStringVector(f, m_targetAlignment);
        }
        
        void readBinWithAlignment(FILE* f) 
        {
                fReadVector(f,e);
                fReadVector(f,sc);
!               fReadStringVector(f, m_sourceAlignment);
!               fReadStringVector(f, m_targetAlignment);
        } 
        
        const IPhrase& GetPhrase() const {return e;}
        const Scores& GetScores() const {return sc;}
!       const WordAlignments& GetSourceAlignment() const {return 
m_sourceAlignment;}
!       const WordAlignments& GetTargetAlignment() const {return 
m_targetAlignment;}
  };
    
  
***************
*** 210,216 ****
        
                // convert target candidates from internal data structure to 
the external one
        void ConvertTgtCand(const TgtCands& tcands,std::vector<StringTgtCand>& 
rv,
!                                                                               
        std::vector<std::string>& wa) const
        {
                for(TgtCands::const_iterator 
i=tcands.begin();i!=tcands.end();++i)
                {
--- 214,221 ----
        
                // convert target candidates from internal data structure to 
the external one
        void ConvertTgtCand(const TgtCands& tcands,std::vector<StringTgtCand>& 
rv,
!                                                                               
        std::vector<StringWordAlignmentCand>& swa,
!                                                                               
        std::vector<StringWordAlignmentCand>& twa) const
        {
                for(TgtCands::const_iterator 
i=tcands.begin();i!=tcands.end();++i)
                {
***************
*** 221,227 ****
                        for(size_t j=0;j<iphrase.size();++j)
                                vs.push_back(&tv->symbol(iphrase[j]));
                        rv.push_back(StringTgtCand(vs,i->GetScores()));
!                       wa.push_back(i->GetAlignment());
                }
        }
  
--- 226,233 ----
                        for(size_t j=0;j<iphrase.size();++j)
                                vs.push_back(&tv->symbol(iphrase[j]));
                        rv.push_back(StringTgtCand(vs,i->GetScores()));
!                       
swa.push_back(StringWordAlignmentCand(vs,(i->GetSourceAlignment())));
!                       
twa.push_back(StringWordAlignmentCand(vs,(i->GetTargetAlignment())));
                }
        }
  
***************
*** 326,338 ****
        {
                
                Scores sc=tcand[i].GetScores();
!               std::string     trgAlign = tcand[i].GetAlignment();
                        
                const IPhrase& iphr=tcand[i].GetPhrase();
  
                out << i << " -- " << sc << " -- ";
                for(size_t j=0;j<iphr.size();++j)                       out << 
tv->symbol(iphr[j])<<" ";
!               out<< " -- " << trgAlign;               
                out << std::endl;
        }
  }
--- 332,348 ----
        {
                
                Scores sc=tcand[i].GetScores();
!               WordAlignments                  
srcAlign=tcand[i].GetSourceAlignment();
!               WordAlignments                  
trgAlign=tcand[i].GetTargetAlignment();
                        
                const IPhrase& iphr=tcand[i].GetPhrase();
  
                out << i << " -- " << sc << " -- ";
                for(size_t j=0;j<iphr.size();++j)                       out << 
tv->symbol(iphr[j])<<" ";
!               out<< " -- ";           
!               for (size_t j=0;j<srcAlign.size();j++)                  out << 
" " << srcAlign[j];
!               out << " -- ";
!               for (size_t j=0;j<trgAlign.size();j++)                  out << 
" " << trgAlign[j];
                out << std::endl;
        }
  }
***************
*** 390,396 ****
  void PhraseDictionaryTree::
  GetTargetCandidates(const std::vector<std::string>& src,
                                                                                
std::vector<StringTgtCand>& rv,
!                                                                               
std::vector<std::string>& wa) const 
  {
        IPhrase f(src.size());
        for(size_t i=0;i<src.size();++i) 
--- 400,407 ----
  void PhraseDictionaryTree::
  GetTargetCandidates(const std::vector<std::string>& src,
                                                                                
std::vector<StringTgtCand>& rv,
!                                                                               
std::vector<StringWordAlignmentCand>& swa,
!                                                                               
std::vector<StringWordAlignmentCand>& twa) const 
  {
        IPhrase f(src.size());
        for(size_t i=0;i<src.size();++i) 
***************
*** 401,407 ****
        
        TgtCands tgtCands;
        imp->GetTargetCandidates(f,tgtCands);
!       imp->ConvertTgtCand(tgtCands,rv,wa);
  }
  
  
--- 412,418 ----
        
        TgtCands tgtCands;
        imp->GetTargetCandidates(f,tgtCands);
!       imp->ConvertTgtCand(tgtCands,rv,swa,twa);
  }
  
  
***************
*** 480,489 ****
                const std::string &sourcePhraseString   =tokens[0]
                                                                                
        ,&targetPhraseString=tokens[1]
                                                                                
        ,&scoreString                           = tokens[2];            
!               const std::string empty;
!               const std::string &alignmentString = PrintWordAlignment() ? 
tokens[3] : empty;
                IPhrase f,e;
                Scores sc;
                        
                std::vector<std::string> wordVec = Tokenize(sourcePhraseString);
                for (size_t i = 0 ; i < wordVec.size() ; ++i)
--- 491,500 ----
                const std::string &sourcePhraseString   =tokens[0]
                                                                                
        ,&targetPhraseString=tokens[1]
                                                                                
        ,&scoreString                           = tokens[2];            
!                               
                IPhrase f,e;
                Scores sc;
+               WordAlignments sourceAlignment, targetAlignment;
                        
                std::vector<std::string> wordVec = Tokenize(sourcePhraseString);
                for (size_t i = 0 ; i < wordVec.size() ; ++i)
***************
*** 565,571 ****
                                abort();
                        }
                }
!               tgtCands.push_back(TgtCand(e,sc, alignmentString));
                assert(currFirstWord!=InvalidLabelId);
        }
    if (PrintWordAlignment())
--- 576,582 ----
                                abort();
                        }
                }
!               tgtCands.push_back(TgtCand(e,sc, sourceAlignment, 
targetAlignment));
                assert(currFirstWord!=InvalidLabelId);
        }
    if (PrintWordAlignment())
***************
*** 650,660 ****
  void PhraseDictionaryTree::
  GetTargetCandidates(PrefixPtr p,
                                                                                
std::vector<StringTgtCand>& rv,
!                                                                               
std::vector<std::string>& wa) const 
  {
        TgtCands tcands;
        imp->GetTargetCandidates(p,tcands);
!       imp->ConvertTgtCand(tcands,rv,wa);
  }
  
  std::string PhraseDictionaryTree::GetScoreProducerDescription() const{
--- 661,672 ----
  void PhraseDictionaryTree::
  GetTargetCandidates(PrefixPtr p,
                                                                                
std::vector<StringTgtCand>& rv,
!                                                                               
std::vector<StringWordAlignmentCand>& swa,
!                                                                               
std::vector<StringWordAlignmentCand>& twa) const 
  {
        TgtCands tcands;
        imp->GetTargetCandidates(p,tcands);
!       imp->ConvertTgtCand(tcands,rv,swa,twa);
  }
  
  std::string PhraseDictionaryTree::GetScoreProducerDescription() const{
diff -wcr moses-2010-09-24/moses/src/PhraseDictionaryTree.h 
moses-2010-09-24.svn/moses/src/PhraseDictionaryTree.h
*** moses-2010-09-24/moses/src/PhraseDictionaryTree.h   2010-10-20 
18:14:57.000000000 -0700
--- moses-2010-09-24.svn/moses/src/PhraseDictionaryTree.h       2010-09-24 
12:57:04.000000000 -0700
***************
*** 79,85 ****
        // get the target candidates for a given phrase
        void GetTargetCandidates(const std::vector<std::string>& src,
                                                                                
                         std::vector<StringTgtCand>& rv,
!                                                                               
                         std::vector<std::string>& wa) const;
  
        /*****************************
         *   access to prefix tree   *
--- 79,86 ----
        // get the target candidates for a given phrase
        void GetTargetCandidates(const std::vector<std::string>& src,
                                                                                
                         std::vector<StringTgtCand>& rv,
!                                                                               
                         std::vector<StringWordAlignmentCand>& swa,
!                                                                               
                         std::vector<StringWordAlignmentCand>& twa) const;
  
        /*****************************
         *   access to prefix tree   *
***************
*** 111,117 ****
                                                                                
                         std::vector<StringTgtCand>& rv) const;
        void GetTargetCandidates(PrefixPtr p,
                                                                                
                         std::vector<StringTgtCand>& rv,
!                                                                               
                         std::vector<std::string>& wa) const;
  
        // print target candidates for a given prefix pointer to a stream, 
mainly 
        // for debugging
--- 112,119 ----
                                                                                
                         std::vector<StringTgtCand>& rv) const;
        void GetTargetCandidates(PrefixPtr p,
                                                                                
                         std::vector<StringTgtCand>& rv,
!                                                                               
                         std::vector<StringWordAlignmentCand>& swa,
!                                                                               
                         std::vector<StringWordAlignmentCand>& twa) const;
  
        // print target candidates for a given prefix pointer to a stream, 
mainly 
        // for debugging
_______________________________________________
Moses-support mailing list
Moses-support@mit.edu
http://mailman.mit.edu/mailman/listinfo/moses-support

Reply via email to