Dear Himanshu,
here's the script of mine: collins_to_deptree_as_factors.pl
Hope it helps,
Ondrej.
On Thu, 6 Aug 2009, [email protected] wrote:
> Dear All,
>
> I am new to moses and was looking to use it for the alignment of dependency
> trees, I came across this page while searching for some previous work.
> http://www.statmt.org/jhuws/?n=Resources.Tools
>
> I believe it could be very helpful for me, but I haven't been able to access
> the script or download it. I searched the CVS and also checked out all the
> branches from svn, but I cannot find the script indicated as:-
> /export/ws06osmt/bin/collins_to_deptree_as_factors.pl
> or the jar file given as:-
> /export/ws06osmt/bin/extract-factors.jar
>
> I would really appreciate it if anyone could point me in the right direction.
> Please help me.
>
> Thank you,
> Himanshu Shivhare
>
>
>
> --
> Click for exclusive coverage on the New Bajaj Pulsar 220 the fastest Indian
> bike
> http://www.zigwheels.com/Features/Bajaj-Pulsar-220-DTSi-Special-Coverage/Pulsar_20090623-1-1
> _______________________________________________
> Moses-support mailing list
> [email protected]
> http://mailman.mit.edu/mailman/listinfo/moses-support
>
#!/usr/bin/perl
# Converts output of Collins' parser to factored text. Parses are encoded
# as dependency trees. '|' gets escaped as '|'
#
# Automatically fixes problems with sentences containing '(', ')', or '/'.
#
# Sample input:
#
# PROB 277 -15.5908 0
# TOP -15.5908 NP -4.44925 NPB -3.94209 DT 0 The
# JJ 0 Trial
# (TOP~Trial~1~1 (NPB~Trial~2~2 The/DT Trial/JJ ) )
# TIME 0
# PROB 148 -9.86947 0
# TOP -9.86947 NP -3.96667 NPB -3.72988 NNP 0 Franz
# NNP 0 Kafka
# (TOP~Kafka~1~1 (NPB~Kafka~2~2 Franz/NNP Kafka/NNP ) )
#
# Sample output:
#
# The|DT|t|1|2 Trial|JJ|NPB|2|0
# Franz|NNP|t|1|2 Kafka|NNP|NPB|2|0
#
# "t" stands for "terminal"
#
# Ondrej Bojar, [email protected]
use strict;
# File-level state shared between the main loop, convert() and setgov().
my $wc; # word counter; convert() does its own counting, so this file-level copy appears unused
my @words; # word list; convert() keeps its own per-sentence list, so this file-level copy appears unused
my @gov; # $gov[N] is the governor (head) position of word N; written by setgov(), read by convert()
my @phrasetype; # $phrasetype[N] is the phrase label recorded for head word N; cleared before each sentence
# Translate an escaped token form (e.g. "-LRB-") back into the literal
# character it stands for. Forms that are not in the table are returned
# unchanged.
#
#   $form   - the word form as it appears in the parser output
#   returns - the unescaped form
sub unescape_collins_form {
    my ($form) = @_;

    my %literal_for = (
        "-SLASH-" => "/",
        "-LRB-"   => "(",
        "-RRB-"   => ")",
        "-LCB-"   => "{",
        "-RCB-"   => "}",
        "-LSB-"   => "[",    ## ok?
        "-RSB-"   => "]",    ## ok?
    );

    return exists $literal_for{$form} ? $literal_for{$form} : $form;
}
# Main driver: read the parser output line by line from the files named on
# the command line (or STDIN) and convert every bracketed parse line, i.e.
# every line starting with "(TOP". All other lines are ignored.
my $err     = 0;    # set to 1 to make the script exit with status 1
my $nl      = 0;    # number of input lines read so far
my $edges;          # only referenced by the commented-out PROB parsing
my $logprob;        # only referenced by the commented-out PROB parsing
my $sentnum = 0;    # number of parses seen so far

LINE:
while (my $line = <>) {
    $nl++;

    # Progress indicator on STDERR: a dot every 10k lines, the running
    # line count every 100k lines.
    if ($nl % 10000 == 0) {
        print STDERR ".";
    }
    if ($nl % 100000 == 0) {
        print STDERR "($nl)";
    }

    chomp $line;

    # We do not report the probability, do not read it
    #if (/^PROB/) {
    #    if (/^PROB ([0-9]+) ([-.0-9]+) 0 *$/) {
    #        $edges = $1;
    #        $logprob = $2;
    #    } else {
    #        print STDERR "$nl: Malformed PROB: $_\n";
    #        $err = 1;
    #    }
    #    next;
    #}

    next LINE unless $line =~ /^\(TOP/;

    $sentnum++;
    @phrasetype = ();    # phrase labels are per sentence
    convert($line, $sentnum);
}

print STDERR "Done.\n";
exit 1 if $err;
# Convert one bracketed parse into a line of factored tokens on STDOUT.
#
# Arguments:
#   $s       - the parse line (starts with "(TOP"); exactly one sentence
#   $sentnum - running sentence number (only used by the commented-out
#              debugging output)
#
# Output: one line with a space-separated token per input word, each token
# being
#   form|tag|phrasetype|position|governor
# where position is the 1-based word index, governor is the position of the
# word's head (0 for the root) and phrasetype is the label of the phrase the
# word heads, or "t" (terminal) if it heads none. A literal '|' inside any
# factor is escaped as '&#124;' so that it cannot be confused with the
# factor separator.
#
# Dies if the bracketing of the parse cannot be reduced.
sub convert {
    my ($s, $sentnum) = @_;
    $s =~ s/\n/ /g;

    # Safety code to protect (), and /. Works only if these characters are
    # separate tokens, not combined with anything else.
    $s =~ s/\(([~\/])/-LRB-$1/g;
    $s =~ s/\)([~\/])/-RRB-$1/g;
    $s =~ s/\/([~\/])/-SLASH-$1/g;

    # Per-sentence state. @gov and @phrasetype are file-level arrays filled
    # by setgov(); clear them here so that nothing can leak over from the
    # previous sentence if this parse turns out to be incomplete.
    @gov        = ();
    @phrasetype = ();
    my @words;

    # Replace every terminal ("form/tag", recognised by the slash) with its
    # 1-based position in the sentence and remember the original token.
    $s =~ s/[ \t]+/ /g;
    my @numbered;
    for my $tok (split / /, $s) {
        if ($tok =~ /\//) {
            push @words, $tok;
            push @numbered, scalar @words;
        }
        else {
            push @numbered, $tok;
        }
    }
    $s = join(" ", @numbered);
    # print "NUMBERED $s\n";

    # Repeatedly collapse the innermost brackets: each "(LABEL kids...)" is
    # replaced by the position of its head word, and setgov() records the
    # dependencies on the way.
    while ($s =~ /\(/) {
        my $replaced = ($s =~ s/\(([^()]*)\)/setgov($1)/eg);
        die "Malformed input. Failed to extract phrase from: $s" if !$replaced;
    }

    # What is left of $s is the position of the head of the whole sentence.
    $gov[$s] = 0;    # set the root node

    #print "<s sentnum=\"$sentnum\" edges=\"$edges\" logprob=\"$logprob\">\n";
    my @tokens;
    for my $i (0 .. $#words) {
        # print "$i: $words[$i]\n";
        my ($form, $tag) = split /\//, $words[$i];
        $form = unescape_collins_form($form);
        my $ord = $i + 1;
        my $phr = defined $phrasetype[$ord] ? $phrasetype[$ord] : "t";
        # print "<f>$form<t>$tag<phr>$phr<r>$ord<g>$gov[$ord]\n";
        my @factors = map {
            my $factor = $_;
            $factor =~ s/\|/&#124;/g;    # keep '|' unambiguous as separator
            $factor;
        } ($form, $tag, $phr, $ord, $gov[$ord]);
        push @tokens, join("|", @factors);
    }
    # print "</s>\n";
    print join(" ", @tokens), "\n";
}

# Record the dependencies inside one bracket and return its head.
#
#   $phrase - contents of one innermost bracket without the parentheses:
#             "LABEL~headform~nchildren~headindex child child ...", where
#             every child is a word position
#   returns - the word position of the phrase's head child
#
# Side effects: sets $gov[child] for every non-head child and, unless
# already set, $phrasetype[head] to the phrase label.
#
# Kept as a top-level sub (not nested inside convert) so that it cannot
# capture stale copies of convert's lexical variables.
sub setgov {
    my $phrase   = shift;
    my @children = split /[\t ]+/, $phrase;
    my $bracket  = shift @children;
    my ($phrasetype, $govform, $nchildren, $govidx_wrong)
        = split /~/, $bracket;

    # Collins does not count PUNC children, let's fix it.
    # NOTE(review): by the time we get here the children are word numbers,
    # not "form/tag" tokens, so the /PUNC test below can never match and
    # every child is counted. Behaviour is kept as it was; confirm the
    # intended punctuation handling against real parser output.
    my $govidx = 0;
    for (my $w_wrong = 0; $govidx <= $#children; $govidx++) {
        last if $w_wrong == $govidx_wrong;
        $w_wrong++ if $children[$govidx] !~ /\/PUNC/;
    }
    $govidx--;    # back to the 0-based index of the head child

    my $gov = $children[$govidx];
    for my $i (0 .. $#children) {
        next if $i == $govidx;
        my $dep = $children[$i];
        # print "$phrase (bracket $bracket, govidx $govidx): Gov of $dep is $gov\n";
        $gov[$dep] = $gov;
    }

    # The innermost phrase headed by a word wins; outer phrases sharing the
    # same head do not overwrite it.
    $phrasetype[$gov] = $phrasetype if !defined $phrasetype[$gov];
    return $gov;
}
_______________________________________________
Moses-support mailing list
[email protected]
http://mailman.mit.edu/mailman/listinfo/moses-support