Repository: incubator-joshua
Updated Branches:
  refs/heads/master 6c48a9753 -> 033f18f82


added filtering to the top 100 rules


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/033f18f8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/033f18f8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/033f18f8

Branch: refs/heads/master
Commit: 033f18f825f787c7af015c428c448b899c414827
Parents: 6c48a97
Author: Matt Post <p...@cs.jhu.edu>
Authored: Thu Oct 13 16:35:21 2016 -0400
Committer: Matt Post <p...@cs.jhu.edu>
Committed: Thu Oct 13 16:35:21 2016 -0400

----------------------------------------------------------------------
 scripts/training/filter-rules.pl | 42 ++++++++++++++++++++++++++++++-----
 scripts/training/pipeline.pl     |  2 +-
 2 files changed, 38 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/033f18f8/scripts/training/filter-rules.pl
----------------------------------------------------------------------
diff --git a/scripts/training/filter-rules.pl b/scripts/training/filter-rules.pl
index 8b74b52..68b88ee 100755
--- a/scripts/training/filter-rules.pl
+++ b/scripts/training/filter-rules.pl
@@ -1,14 +1,32 @@
 #!/usr/bin/env perl
 
 # Removes rules from phrase tables and grammars according to various criteria.
+#
+# October 2016. It turns out that Thrax keeps *all* rules it finds for each source
+# side. In big bitexts, this can mean that common words collect tens or even hundreds
+# of thousands of translation options, due to garbage collection (Moore, 2004), all of
+# which are then retained. These can be filtered out by this script, which will reduce
+# the grammar to contain only the top 100 translation options (by count) for each source
+# side. You just need to provide the field that contains the "Rarity Penalty" computed
+# by thrax. This is field 3 (0-indexed) by default. To filter in this way:
+#
+# gzip -cd grammar.gz | filter-rules.pl -t 100 -f 3 | gzip -9n > grammar-filtered.gz
+#
+# You can also filter by using the model weights, say after tuning:
+#
+# gzip -cd grammar.gz | filter-rules.pl -t 100 -c /path/to/joshua.config -o pt ...
+#
+# Really this should just be built into Thrax, which could use the rarity penalty there.
 
 use strict;
 use warnings;
 use List::Util qw/max sum/;
 use Getopt::Std;
 
-my %opts = ( t => 100 );
-my $ret = getopts("bps:uvc:t:o:", \%opts);
+my %opts = ( 
+  f => 3, # default field for rarity penalty is 3 (0-indexed)
+);
+my $ret = getopts("bps:uvc:t:o:f:", \%opts);
 
 if (!$ret) {
   print "Usage: filter-rules.pl [-u] [-s SCOPE] [-v]\n";
@@ -17,10 +35,10 @@ if (!$ret) {
   print "   -s SCOPE: remove rules with scope > SCOPE (Hopkins & Langmead, 
2010)\n";
   print "   -u: remove abstract unary rules\n";
   print "   -v: be verbose\n";
+  print "   -t: only include top N candidates (requires either -f or (-c and 
-o)\n";
+  print "   -f: rarity penalty field to use when filtering (index or name) to 
-t without -c (default:3)\n";
   print "   -c: path to joshua config file\n";
   print "   -o: grammar owner (required for -t)\n";
-  print "   -t: only include top N candidates by weight (requires config 
file)\n";
-  print "   -f: score field to use when filtering (index or name) to -t 
without -c\n";
   exit;
 }
 
@@ -110,7 +128,7 @@ sub filter_and_print_rules {
   my @rules = @$rulelist;
 
   my @filtered_rules = ();
-  if ($opts{c}) {
+  if ($opts{t} and $opts{c}) {
     my %scores;
     foreach my $rule (@rules) {
       my @tokens = split(/ \|\|\| /, $rule);
@@ -133,6 +151,20 @@ sub filter_and_print_rules {
     @filtered_rules = splice(@sorted_rules, 0, $opts{t});
     $SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);
 
+  } elsif ($opts{t} and $opts{f}) {
+    # Filter using field f (0-indexed), which is assumed to be the rarity penalty field
+    my %rarities;
+    foreach my $rule (@rules) {
+      my @tokens = split(/ \|\|\| /, $rule);
+      my $features = $tokens[3];
+      my @features = split(" ", $features);
+      my $rarity = $features[$opts{f}] || 1.0;
+      $rarities{$rule} = 1-log($rarity); # Thrax sets rarity = exp(1-count(e,f)), sigh
+    }
+    my @sorted_rules = sort { $rarities{$b} <=> $rarities{$a} } keys(%rarities);
+    @filtered_rules = splice(@sorted_rules, 0, $opts{t});
+    $SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);
+
   } else {
     @filtered_rules = @rules;
   }
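
For reference, the arithmetic behind the new -f branch above: Thrax stores the rarity penalty as exp(1 - count(e,f)), so 1 - log(rarity) recovers the raw count, and sorting on that value in descending order ranks translation options from most to least frequent; filter-rules.pl applies this per source side and keeps only the top -t of them. The standalone Perl sketch below illustrates just that ranking step. The toy rules and their counts are invented for illustration; only the ||| -separated grammar format and the 0-indexed feature field 3 are taken from the script.

#!/usr/bin/env perl
# Sketch of the count-based ranking used by the -f branch of filter-rules.pl.
# The rules below are made up; field 3 (0-indexed) holds exp(1 - count(e,f)).
use strict;
use warnings;

my @rules = (
  "[X] ||| haus ||| house ||| 0.1 0.2 0.3 " . exp(1 - 50),  # seen 50 times
  "[X] ||| haus ||| home ||| 0.4 0.1 0.2 " . exp(1 - 20),   # seen 20 times
  "[X] ||| haus ||| shell ||| 0.9 0.8 0.7 " . exp(1 - 1),   # seen once
);

my $field = 3;   # rarity penalty field, as with filter-rules.pl -f 3
my $top_n = 2;   # keep only the two most frequent options

my %counts;
foreach my $rule (@rules) {
  my @tokens   = split(/ \|\|\| /, $rule);
  my @features = split(" ", $tokens[3]);
  # Invert rarity = exp(1 - count): 1 - log(rarity) == count.
  $counts{$rule} = 1 - log($features[$field]);
}

my @kept = (sort { $counts{$b} <=> $counts{$a} } keys %counts)[0 .. $top_n - 1];
print "$_\n" for @kept;   # prints the "house" and "home" rules, drops "shell"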

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/033f18f8/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index e653cf9..37e1868 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1154,7 +1154,7 @@ if (! defined $GRAMMAR_FILE) {
     system("mv $thrax_file.tmp $thrax_file");
 
     $cachepipe->cmd("thrax-run",
-                    "hadoop jar $THRAX/bin/thrax.jar -D 
mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D 
mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR 
$thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs 
-getmerge $THRAXDIR/final/ grammar.gz",
+                    "hadoop jar $THRAX/bin/thrax.jar -D 
mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D 
mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR 
$thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs 
-cat $THRAXDIR/final/* | tee grammar-unfiltered.gz | gzip -cd | 
$JOSHUA/scripts/training/filter-rules.pl -t 100 -c 3 | gzip -9n > grammar.gz",
                     "$DATA_DIRS{train}/thrax-input-file",
                     $thrax_file,
                     "grammar.gz");
