Repository: incubator-joshua Updated Branches: refs/heads/master 6c48a9753 -> 033f18f82
added filtering to the top 100 rules Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/033f18f8 Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/033f18f8 Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/033f18f8 Branch: refs/heads/master Commit: 033f18f825f787c7af015c428c448b899c414827 Parents: 6c48a97 Author: Matt Post <p...@cs.jhu.edu> Authored: Thu Oct 13 16:35:21 2016 -0400 Committer: Matt Post <p...@cs.jhu.edu> Committed: Thu Oct 13 16:35:21 2016 -0400 ---------------------------------------------------------------------- scripts/training/filter-rules.pl | 42 ++++++++++++++++++++++++++++++----- scripts/training/pipeline.pl | 2 +- 2 files changed, 38 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/033f18f8/scripts/training/filter-rules.pl ---------------------------------------------------------------------- diff --git a/scripts/training/filter-rules.pl b/scripts/training/filter-rules.pl index 8b74b52..68b88ee 100755 --- a/scripts/training/filter-rules.pl +++ b/scripts/training/filter-rules.pl @@ -1,14 +1,32 @@ #!/usr/bin/env perl # Removes rules from phrase tables and grammars according to various criteria. +# +# October 2016. It turns out that Thrax keeps *all* rules it finds for each source +# side. In big bitexts, this can mean that common words collect tens or even hundreds +# of thousands of translation options, due to garbage collection (Moore, 2004), all of +# which are then retained. These can be filtered out by this script, which will reduce +# the grammar to contain only the top 100 translation options (by count) for each source +# side. You just need to provide the field that contains the "Rarity Penalty" computed +# by thrax. This is field 3 (0-indexed) by default. 
To filter in this way: +# +# gzip -cd grammar.gz | filter-rules.pl -t 100 -f 3 | gzip -9n > grammar-filtered.gz +# +# You can also filter by using the model weights, say after tuning: +# +# gzip -cd grammar.gz | filter-rules.pl -t 100 -c /path/to/joshua.config -o pt ... +# +# Really this should just be built into Thrax, which could use the rarity penalty there. use strict; use warnings; use List::Util qw/max sum/; use Getopt::Std; -my %opts = ( t => 100 ); -my $ret = getopts("bps:uvc:t:o:", \%opts); +my %opts = ( + f => 3, # default field for rarity penalty is 3 (0-indexed) +); +my $ret = getopts("bps:uvc:t:o:f:", \%opts); if (!$ret) { print "Usage: filter-rules.pl [-u] [-s SCOPE] [-v]\n"; @@ -17,10 +35,10 @@ if (!$ret) { print " -s SCOPE: remove rules with scope > SCOPE (Hopkins & Langmead, 2010)\n"; print " -u: remove abstract unary rules\n"; print " -v: be verbose\n"; + print " -t: only include top N candidates (requires either -f or (-c and -o))\n"; + print " -f: rarity penalty field to use when filtering (index or name) to -t without -c (default: 3)\n"; print " -c: path to joshua config file\n"; print " -o: grammar owner (required for -t)\n"; - print " -t: only include top N candidates by weight (requires config file)\n"; - print " -f: score field to use when filtering (index or name) to -t without -c\n"; exit; } @@ -110,7 +128,7 @@ sub filter_and_print_rules { my @rules = @$rulelist; my @filtered_rules = (); - if ($opts{c}) { + if ($opts{t} and $opts{c}) { my %scores; foreach my $rule (@rules) { my @tokens = split(/ \|\|\| /, $rule); @@ -133,6 +151,20 @@ @filtered_rules = splice(@sorted_rules, 0, $opts{t}); $SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules); + } elsif ($opts{t} and $opts{f}) { + # Filter using field f (0-indexed), which is assumed to be the rarity penalty field + my %rarities; + foreach my $rule (@rules) { + my @tokens = split(/ \|\|\| /, $rule); + my $features = $tokens[3]; + my @features = 
split(" ", $features); + my $rarity = $features[$opts{f}] || 1.0; + $rarities{$rule} = 1-log($rarity); # Thrax sets rarity = exp(1-count(e,f)), sigh + } + my @sorted_rules = sort { $rarities{$b} <=> $rarities{$a} } keys(%rarities); + @filtered_rules = splice(@sorted_rules, 0, $opts{t}); + $SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules); + } else { @filtered_rules = @rules; } http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/033f18f8/scripts/training/pipeline.pl ---------------------------------------------------------------------- diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl index e653cf9..37e1868 100755 --- a/scripts/training/pipeline.pl +++ b/scripts/training/pipeline.pl @@ -1154,7 +1154,7 @@ if (! defined $GRAMMAR_FILE) { system("mv $thrax_file.tmp $thrax_file"); $cachepipe->cmd("thrax-run", - "hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs -getmerge $THRAXDIR/final/ grammar.gz", + "hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs -cat $THRAXDIR/final/* | tee grammar-unfiltered.gz | gzip -cd | $JOSHUA/scripts/training/filter-rules.pl -t 100 -f 3 | gzip -9n > grammar.gz", "$DATA_DIRS{train}/thrax-input-file", $thrax_file, "grammar.gz");