#!/usr/bin/perl -w

# align_generate_cooc.pl
# Alex Fraser

use strict;

# Positional arguments. The script accepts either two arguments
# (SNT_FN OUT_COOC) or three (E_FN F_FN OUT_COOC); see the usage text below.
my $fn1 = shift;
my $fn2 = shift;
my $out_fn = shift;

die "usage: align_generate_cooc.pl SNT_FN OUT_COOC\n  or   align_generate_cooc.pl E_FN F_FN OUT_COOC\n"
    unless defined($fn2) and @ARGV==0;

# With only two arguments the second one is really the output file and the
# first is a combined SNT file, so shift the values into the right variables.
my $use_SNT = 0;
if (not defined($out_fn)) {
    $out_fn = $fn2;
    $fn2 = undef;
    $use_SNT = 1;
}

# Remove any output left over from an earlier run.  unlink() is used instead
# of shelling out to "rm -f" so that file names containing spaces or shell
# metacharacters are handled literally.  As with "rm -f", a missing file is
# not an error and a failed removal does not abort the run.
if (-e $out_fn or -l $out_fn) {
    unlink($out_fn) or warn "could not remove old output file $out_fn: $!\n";
}

# Maximum number of distinct pairs stored in each of the two hashes.
# this is set so each of the two hashes becomes about 0.5 GB
my $max_hash_size = 4000000;

# Both hashes are used as sets of pairs: the key is pack("LL", e, f) and the
# value is never read.  %hash1 is filled first and kept in memory until the
# end of the run; %hash2 takes the overflow and is written to the temp file
# and emptied each time it fills up.
my %hash1;
my $hash1_size = 0;
my %hash2;
my $hash2_size = 0;

# Running count of English tokens read; only used in progress messages.
my $english_words = 0;

# Write every pair stored in the hash referenced by $pairs to $fh, one
# "E F" line per key.
my $dump_pairs = sub {
    my ($fh, $pairs) = @_;
    for my $k (keys %$pairs) {
        my ($e, $f) = unpack("LL", $k);
        print {$fh} "$e $f\n";
    }
    return;
};

# Open the input(s) with three-argument open so the file names are always
# taken literally (never parsed as an open mode or a pipe command).
my ($snt_fh, $e_fh, $f_fh);
if ($use_SNT) {
    open($snt_fh, '<', $fn1) or die "failed to open SNT file $fn1: $!\n";
}
else {
    open($e_fh, '<', $fn1) or die "failed to open E file $fn1: $!\n";
    open($f_fh, '<', $fn2) or die "failed to open F file $fn2: $!\n";
}

# Unsorted pairs (possibly containing duplicates) are collected here.
my $tmp_fn = "$out_fn.tmp_align_generate_cooc";
open(my $tmp_fh, '>', $tmp_fn)
    or die "failed to open temp file $tmp_fn for writing: $!\n";

while (1) {
    my $english_line;
    my $french_line;
    if ($use_SNT) {
        # Each SNT record is three lines; the first one is read and ignored
        # (presumably a sentence count -- confirm against the SNT format).
        my $num_line = <$snt_fh>;
        last if not defined($num_line);
        $english_line = <$snt_fh>;
        die "FATAL: expected english line" if not defined($english_line);
        $french_line = <$snt_fh>;
        die "FATAL: expected french line" if not defined($french_line);
    }
    else {
        # Read the two files in lock step; they must end on the same line.
        $english_line = <$e_fh>;
        $french_line = <$f_fh>;
        last if (not defined($english_line) and not defined($french_line));
        die "FATAL: e or f file is truncated" if (not defined($english_line) or not defined($french_line));
    }

    chomp($english_line);
    chomp($french_line);

    my @english_tokens = split(' ', $english_line);
    my @french_tokens = split(' ', $french_line);

    $english_words += scalar(@english_tokens);

    # Prepend id 0 so every French token is also paired with 0
    # (presumably the NULL/empty word -- confirm).
    unshift(@english_tokens, 0);

    for my $e_tok (@english_tokens) {
        my $e = $e_tok + 0;
        for my $f_tok (@french_tokens) {
            my $f = $f_tok + 0;

            # Tokens are treated as numbers and packed as two unsigned
            # 32-bit values to keep the hash keys compact.
            my $e_f = pack("LL", $e, $f);

            ## hash1 stays in memory, fill it until full
            if (exists $hash1{$e_f}) {
                #NOOP
            }
            ## hash2 is periodically dumped
            elsif (exists $hash2{$e_f}) {
                #NOOP
            }
            elsif ($hash1_size < $max_hash_size) {
                $hash1{$e_f} = undef;
                $hash1_size++;
            }
            elsif ($hash2_size < $max_hash_size) {
                $hash2{$e_f} = undef;
                $hash2_size++;
                if ($hash2_size % 1000000 == 0) {
                    print STDERR "hash2 size $hash2_size english words $english_words\n";
                }
            }
            else {
                # hash2 is full: add the new pair, flush everything to the
                # temp file and start over with an empty hash2.  Pairs that
                # were flushed may be written again later; the final
                # "sort -u" pass removes such duplicates.
                $hash2{$e_f} = undef;
                print STDERR "clearing hash2 (size $max_hash_size) at english words $english_words\n";
                $dump_pairs->($tmp_fh, \%hash2);
                %hash2 = ();
                $hash2_size = 0;
            }
        }
    }
}

# Write out whatever is still held in memory.
$dump_pairs->($tmp_fh, \%hash1);
$dump_pairs->($tmp_fh, \%hash2);
# Buffered write errors surface at close, so this check matters.
close($tmp_fh) or die "failed to close temp file $tmp_fn: $!\n";

# Sort the collected pairs numerically by first and then second column,
# dropping duplicate lines (-u), and write the result to the output file.
# The list form of system() bypasses the shell, so file names are passed to
# sort literally; "-o" replaces the shell redirection and "--" stops a temp
# file name that starts with "-" from being read as an option.
my @sort_cmd = ('sort', '-k1,1n', '-k2,2n', '-u', '-o', $out_fn, '--', $tmp_fn);
if (system(@sort_cmd) != 0) {
    my $why = ($? == -1) ? "could not run sort: $!" : "sort failed (wait status $?)";
    # Keep the temp file so the collected pairs are not lost.
    die "FATAL: $why; unsorted pairs left in $tmp_fn\n";
}
unlink($tmp_fn) or warn "could not remove temp file $tmp_fn: $!\n";
