Dear Himanshu,

here's the script of mine: collins_to_deptree_as_factors.pl

Hope it helps,
  Ondrej.

On Thu, 6 Aug 2009, [email protected] wrote:

> Dear All,
> 
> I am new to moses and was looking to use it for the alignment of dependency 
> trees, I came across this page while searching for some previous work. 
> http://www.statmt.org/jhuws/?n=Resources.Tools
> 
> I believe it could be very helpful for me, but I haven't been able to access 
> the script or download it. I searched the CVS and also checked out all the 
> branches from svn, but I cannot find the script indicated as:-
> /export/ws06osmt/bin/collins_to_deptree_as_factors.pl
> or the jar file given as:-
> /export/ws06osmt/bin/extract-factors.jar
> 
> I would really appreciate it if anyone could point me in the right direction. 
> Please help me.
> 
> Thank you,
> Himanshu Shivhare
> 
> 
> 
> --
> Click for exclusive coverage on the New Bajaj Pulsar 220 the fastest Indian 
> bike
> http://www.zigwheels.com/Features/Bajaj-Pulsar-220-DTSi-Special-Coverage/Pulsar_20090623-1-1
> _______________________________________________
> Moses-support mailing list
> [email protected]
> http://mailman.mit.edu/mailman/listinfo/moses-support
> 
#!/usr/bin/perl
# Converts output of Collins' parser to factored text. Parses are encoded
# as dependency trees. '|' gets escaped as '&#124;'
#
# Automatically fixes problems with sentences containing '(', ')', or '/'.
#
# Sample input:
#
# PROB 277 -15.5908 0 
# TOP -15.5908 NP -4.44925 NPB -3.94209 DT 0 The
#            JJ 0 Trial
# (TOP~Trial~1~1 (NPB~Trial~2~2 The/DT Trial/JJ ) ) 
# TIME 0
# PROB 148 -9.86947 0 
# TOP -9.86947 NP -3.96667 NPB -3.72988 NNP 0 Franz
#            NNP 0 Kafka
# (TOP~Kafka~1~1 (NPB~Kafka~2~2 Franz/NNP Kafka/NNP ) ) 
#
# Sample output:
#
# The|DT|t|1|2 Trial|JJ|NPB|2|0
# Franz|NNP|t|1|2 Kafka|NNP|NPB|2|0
#
# "t" stands for "terminal"
#
# Ondrej Bojar, [email protected]


use strict;

my $wc; # file-scope word counter; NOTE(review): appears unused here, convert() tracks the word count locally
my @words; # file-scope word list; NOTE(review): shadowed by the lexical @words declared inside convert()
my @gov; # $gov[$i] is the index of the governor of word $i (words are numbered from 1, 0 marks the root); filled by setgov() and convert()
my @phrasetype; # $phrasetype[$i] is the label of the first phrase found to be headed by word $i; emptied by the main loop before each sentence

# Translates one of the placeholder tokens used for slashes and brackets
# (-SLASH-, -LRB-, -RRB-, -LCB-, -RCB-, -LSB-, -RSB-) back into the literal
# character it stands for. Any other word form is handed back untouched.
#
#   $form   - a single word form
#   returns - the literal character for a placeholder, otherwise $form itself
sub unescape_collins_form {
  my ($form) = @_;
  my %literal_for = (
    "-SLASH-" => "/",
    "-LRB-"   => "(",
    "-RRB-"   => ")",
    "-LCB-"   => "{",
    "-RCB-"   => "}",
    "-LSB-"   => "[",   # square brackets were marked "ok?" by the author
    "-RSB-"   => "]",
  );
  if (exists $literal_for{$form}) {
    return $literal_for{$form};
  }
  return $form;
}

# Main driver: reads the parser output line by line from the files named on
# the command line (or STDIN) and converts every line that starts with
# "(TOP", i.e. every bracketed parse, into one line of factored output.
# All other lines (PROB, TIME, the per-node score dump) are skipped.
my $err = 0;      # exit status flag; only the disabled PROB check below sets it
my $nl = 0;       # number of input lines read so far
my $edges;        # used only by the disabled PROB parsing below
my $logprob;      # used only by the disabled PROB parsing below
my $sentnum = 0;  # number of parses seen so far
while (defined(my $line = <>)) {
  $nl++;

  # Progress meter on STDERR: a dot every 10,000 lines and the running
  # line count every 100,000 lines.
  if ($nl % 10000 == 0) {
    print STDERR ".";
  }
  if ($nl % 100000 == 0) {
    print STDERR "($nl)";
  }
  chomp $line;

  # The probability is not reported, so PROB lines are not parsed.
  # Disabled code kept for reference:
  #if (/^PROB/) {
  #  if (/^PROB ([0-9]+) ([-.0-9]+) 0 *$/) {
  #    $edges = $1;
  #    $logprob = $2;
  #  } else {
  #    print STDERR "$nl: Malformed PROB: $_\n";
  #    $err = 1;
  #  }
  #  next;
  #}

  next unless $line =~ /^\(TOP/;

  $sentnum++;
  @phrasetype = ();   # phrase labels are per sentence
  convert($line, $sentnum);
}
print STDERR "Done.\n";

exit 1 if $err;




# Attaches the children of one bracketed phrase to the phrase's head word.
#
# This is deliberately a top-level sub: Perl does not nest named subs, and a
# named sub written inside convert() would only ever see the lexicals of the
# first convert() call. It therefore communicates through the file-level
# arrays @gov and @phrasetype only.
#
#   $phrase - contents of one innermost bracket, parentheses removed, e.g.
#             "NPB~Trial~2~2 1 2 ". The first token is the phrase label
#             "TYPE~headform~nchildren~headidx"; every further token is the
#             index of a word (or of the head word of an already reduced
#             sub-phrase). convert() marks punctuation words as "N/PUNC".
#   returns - the plain index of the head word, or undef for a phrase
#             without children.
#
# Side effects: sets $gov[child] for every non-head child and records the
# phrase type in $phrasetype[head] unless the head already has one (the
# innermost phrase wins, because phrases are reduced inside-out).
sub setgov {
  my $phrase = shift;
  my @children = split /[\t ]+/, $phrase;
  my $label = shift @children;
  my ($type, undef, undef, $govidx_wrong) = split /~/, $label;

  # Remember which children are punctuation words, then reduce every child
  # to its bare word index.
  my @is_punc = map { m{/PUNC\z} ? 1 : 0 } @children;
  s{/PUNC\z}{} for @children;

  # Per the original author's note, Collins does not count PUNC children in
  # the head index, so translate it into a real position among the children
  # by skipping them while counting.
  my $govidx = 0;
  for (my $counted = 0; $govidx <= $#children; $govidx++) {
    last if $counted == $govidx_wrong;
    $counted++ if !$is_punc[$govidx];
  }
  $govidx--;
  # A head index of 0 (or a missing one) would leave $govidx at -1; fall
  # back to the last child explicitly so that it is not attached to itself.
  $govidx = $#children if $govidx < 0;

  my $gov = @children ? $children[$govidx] : undef;
  for my $i (0 .. $#children) {
    next if $i == $govidx;
    $gov[ $children[$i] ] = $gov;
  }
  if (defined $gov && !defined $phrasetype[$gov]) {
    $phrasetype[$gov] = $type;
  }
  return $gov;
}

# Converts one parse of Collins' parser into one line of factored text and
# prints it to STDOUT.
#
#   $s       - the bracketed parse, e.g.
#              "(TOP~Trial~1~1 (NPB~Trial~2~2 The/DT Trial/JJ ) ) ";
#              exactly one sentence must be given
#   $sentnum - running sentence number (accepted for the callers' sake,
#              currently not used in the output)
#
# Every word becomes "form|tag|phrasetype|index|governor", where index counts
# from 1, governor is the index of the governing word (0 for the root) and
# phrasetype is the label of the innermost phrase headed by the word, or "t"
# for a word that heads no phrase. '|' inside a factor is written as
# '&#124;'. Dies if the brackets cannot be reduced.
sub convert {
  my $s = shift;
  my $sentnum = shift;

  $s =~ s/\n/ /g;

  # Safety code to protect ( ) and /. Works only if they are separate
  # tokens (or head forms in a label), not combined with anything else.
  $s =~ s/\(([~\/])/-LRB-$1/g;
  $s =~ s/\)([~\/])/-RRB-$1/g;
  $s =~ s/\/([~\/])/-SLASH-$1/g;

  # Per-sentence state. The arrays shared with setgov() live at file level
  # and must be emptied here, otherwise governors of a previous sentence
  # could leak into this one.
  @gov = ();
  @phrasetype = ();
  my @words = ();

  # Replace every "form/TAG" token by its 1-based word index. Punctuation
  # keeps a "/PUNC" marker so that setgov() can still recognise it after the
  # word itself is gone from the string.
  $s =~ s/[ \t]+/ /g;
  my @numbered;
  for my $tok (split / /, $s) {
    if ($tok =~ /\//) {
      push @words, $tok;
      my $ord = scalar @words;
      push @numbered, ($tok =~ /\/PUNC/ ? "$ord/PUNC" : $ord);
    }
    else {
      push @numbered, $tok;
    }
  }
  $s = join(" ", @numbered);

  # Reduce the innermost brackets to the index of their head word until no
  # bracket is left.
  while ($s =~ /\(/) {
    my $replaced = ($s =~ s/\(([^()]*)\)/setgov($1)/eg);
    die "Malformed input. Failed to extract phrase from: $s" if !$replaced;
  }

  # What remains is the index of the head of the whole sentence: the root.
  $gov[$1] = 0 if $s =~ /^\s*(\d+)/;

  my @factored;
  for my $i (0 .. $#words) {
    my ($form, $tag) = split /\//, $words[$i];
    $form = unescape_collins_form($form);
    my $ord = $i + 1;
    # A word that never received a governor (possible only for malformed
    # trees) is attached to the root rather than printed with an empty
    # factor.
    my $head = defined $gov[$ord] ? $gov[$ord] : 0;
    my $phr = defined $phrasetype[$ord] ? $phrasetype[$ord] : "t";
    my @factors = ($form, $tag, $phr, $ord, $head);
    s/\|/&#124;/g for @factors;
    push @factored, join("|", @factors);
  }
  print join(" ", @factored), "\n";

}
_______________________________________________
Moses-support mailing list
[email protected]
http://mailman.mit.edu/mailman/listinfo/moses-support

Reply via email to