#!/usr/bin/perl

use strict;
use warnings;
use constant MAX_X_GRIDPTS => 70;
use constant YGRID => 40;
use constant SYMBOL => "+";

sub make_data_hash {
    #reads in a named file with 2 free format sorted columns of paired x-y data
    #stores each point in a hash, and stores hash refs in an array
    my @args = @_;
    #conditional assignment
    my $filename = $args[0] || "-";
    my @data_array= ();
    open (DATA, $filename);
    while (<DATA>){
	my $line = $_;
	#kill all leading whitespace?
	$line =~ s/^\s+//g;
	my @list = split /\s+/, $line;
	if (scalar(@list) == 1){
	    $list[1] = 0;
	}
	push @data_array, { $list[0] => $list[1]  };
    }
    close (DATA);
    return \@data_array;
}

sub analyze_data {
#looks at data, finds limits, and basic stats.
    my @args = @_;
    my $data_array_ref =$args[0];
    my @data_array = @$data_array_ref;
    my $x_max = -1E99;
    my $x_min = 1E99;
    my $y_max = -1E99;
    my $y_min = 1E99;
    my $x_sum = 0;
    my $y_sum = 0;
    my $xx_sum = 0;
    my $yy_sum = 0;
    my $num_points = scalar(@data_array) ;
    my $x_mean = 0;
    my $x_sd = 0;
    my $y_mean = 0;
    my $y_sd = 0;
    my $xx_mean = 0;
    my $yy_mean = 0;
    my %hash_summary = ();
    for (my $i = 0; $i < $num_points; $i++){
	my $hash_ref=$data_array[$i];
	my %hash=%$hash_ref;
	my @x_key = keys (%hash);
	my $x_curr = $x_key[0];
	my @y_key = values (%hash);
	my $y_curr = $y_key[0];
	if ($x_curr > $x_max ){
	    $x_max = $x_curr;
	}
	if ($x_curr < $x_min ){
	    $x_min = $x_curr;
	}
	if ($y_curr > $y_max ){
	    $y_max = $y_curr;
	}
	if ($y_curr < $y_min ){
	    $y_min = $y_curr;
	}
	$x_sum = $x_sum + $x_curr;
	$y_sum = $y_sum + $y_curr;
	$xx_sum = $xx_sum + ($x_curr * $x_curr);
	$yy_sum = $yy_sum + ($y_curr * $y_curr);
    }
    $x_mean = $x_sum / $num_points; 
    $y_mean = $y_sum / $num_points; 
    $xx_mean = $xx_sum / $num_points; 
    $yy_mean = $yy_sum / $num_points; 
    #SD by sample population
    $x_sd = sqrt($xx_mean - ($x_mean*$x_mean));
    $y_sd = sqrt($yy_mean - ($y_mean*$y_mean));
    $hash_summary{ 'xmax' } = $x_max;
    $hash_summary{ 'ymax' } = $y_max;
    $hash_summary{ 'xmin' } = $x_min;
    $hash_summary{ 'ymin' } = $y_min;
    $hash_summary{ 'xbar' } = $x_mean;
    $hash_summary{ 'ybar' } = $y_mean;
    $hash_summary{ 'sigx' } = $x_sd;
    $hash_summary{ 'sigy' } = $y_sd;
    $hash_summary{ 'points' } = $num_points;

    return \%hash_summary;
}


sub print_stats{
    #takes a hash by ref and prints keys/values to stdout
    my @args = @_;
    my $hash_ref=$args[0];
    my %stat_hash=%$hash_ref;
    my $points = $stat_hash{points};
    printf ("Data on %.6s points:\n", $points);
    printf ("XLIM: %6.6s  <  X  <  %6.6s  Mean: %6.6s  Sig: %6.6s\n", $stat_hash{xmin},$stat_hash{xmax},$stat_hash{xbar},$stat_hash{sigx});
    printf ("YLIM: %6.6s  <  Y  <  %6.6s  Mean: %6.6s  Sig: %6.6s\n", $stat_hash{ymin},$stat_hash{ymax},$stat_hash{ybar},$stat_hash{sigy});
}
sub print_hash {
    #takes a hash by ref and prints keys/values to stdout
    my @args = @_;
    my $hash_ref=$args[0];
    my %hash=%$hash_ref;
    my @keys = keys (%hash);
    my @values = values (%hash);
    for (my $i = 0 ; $i < scalar (@keys) ; $i++){
	my $key = $keys[$i];
	my $value = $values[$i];
	print $key."     ".$value."\n";
    }
}

sub create_grid {
#creates a 2D array of hashes with the boundaries of each "box" of the final grid
#stored as hash values.
    my @args = @_;
    my $x_grid = $args[0];
    my $y_grid = $args[1];
    my $stat_ref = $args[2];
    my %stat_hash = %$stat_ref;
    my $xmin = $stat_hash{xmin};
    my $xmax = $stat_hash{xmax};
    my $ymin = $stat_hash{ymin};
    my $ymax = $stat_hash{ymax};
    my $x_range = $xmax - $xmin;
    my $y_range = $ymax - $ymin;
    my $x_bins = $x_range / $x_grid;
    my $y_bins = $y_range / $y_grid;
    my @grid_array = ();
    #initialize a 2D array of hashes, x first(by row), y second(by column)
    for (my $i = 0 ; $i < $x_grid ; $i++){
	my @column_array = ();
	for ( my $j = 0 ; $j < $y_grid ; $j++){
	    my $box_hash_ref;
	    $box_hash_ref->{box} = $i.$j;
	    $box_hash_ref->{minx} = $xmin + $i * $x_bins;
	    $box_hash_ref->{maxx} = $xmin + ($i + 1) * $x_bins;
	    $box_hash_ref->{miny} = $ymin + $j * $y_bins;
	    $box_hash_ref->{maxy} = $ymin + ($j + 1) * $y_bins;
	    # some hack for boundary boxes
	    if( $j == ($y_grid -1 )){
		$box_hash_ref->{endy} = "YES";
	    }else{
		$box_hash_ref->{endy} = "NO";
	    }
	    if( $i == ($x_grid -1 )){
		$box_hash_ref->{endx} = "YES";
	    }else{
		$box_hash_ref->{endx} = "NO";
	    }
	    push @column_array, $box_hash_ref; 
	}
	push @grid_array, \@column_array;
    }

    return \@grid_array;
}

sub print_2D_array_of_hashes {
    my @args = @_;
    my @array = @{$args[0]};
    my $num_columns = scalar @array;
    for ( my $i = 0; $i < $num_columns; $i++ ){
        my @column = @{$array[$i]};
        my $num_elem = scalar @column;
        for (my $j = 0; $j<$num_elem;$j++){
	    my %hash = %{$column[$j]};
	    print $i." ".$j."     ";
            print %hash;
	    print "\n";
        }
    }
}


sub bin_data {
    #calculates the mean of data within each bin
    my @args = @_;
    my $data_array_ref =$args[0];
    my @data_array = @$data_array_ref;
    my $grid_array_ref =$args[1];
    my @grid_array =@$grid_array_ref;
    my $num_bins = scalar (@grid_array);
    for (my $i = 0; $i < $num_bins ; $i++){ 
	my $xlow = $grid_array[$i][0]->{minx};
	my $xhigh = $grid_array[$i][0]->{maxx};
	my $total = 0;
	my $num_elements = 0;
	#for each bin, loop through the data with matching x values and bin.
	#this also eliminates the need for sorting, so it's slow, but works
	for (my $j = 0; $j < scalar @data_array; $j++){
	    my @hash_keys = keys ( %{$data_array[$j]});
	    my @hash_vals = values ( %{$data_array[$j]});
	    my $hash_x = $hash_keys[0];
	    my $hash_y = $hash_vals[0];
	    if ( $hash_x >= $xlow && $hash_x < $xhigh ){
		$total = $total + $hash_y;
		$num_elements++;
	    }
	    if ($grid_array[$i][0]->{"endx"} eq "YES" && $hash_x >= $xlow && $hash_x <= $xhigh ){
		$total = $total + $hash_y;
		$num_elements++;
	    }

	}
	#average all y values in the bin
	my $bin_value = 0;
	if ($num_elements != 0){
	    $bin_value = $total / $num_elements;
	} 
	&associate_value($grid_array_ref,$i,$bin_value);
    }
#    &print_2D_array_of_hashes($grid_array_ref);
}

sub associate_value{
    #associates the bin value with the appropriate grid "box"
    my @args = @_;
    my $grid_array_ref=$args[0];
    my $column_index=$args[1];
    my $bin_value=$args[2];
    my @grid_array =@$grid_array_ref;
    my @column=@{$grid_array[$column_index]};
    for (my $i =0 ; $i < scalar @column; $i++){
	my $ylow = $column[$i]->{miny};
	my $yhigh = $column[$i]->{maxy};
	if ($bin_value >= $ylow && $bin_value < $yhigh){
	    $column[$i]->{plot_val} = $bin_value;
	} elsif ($column[$i]->{endy} eq "YES" && $bin_value >= $ylow && $bin_value <= $yhigh){
	    $column[$i]->{plot_val} = $bin_value;
	} else {
	    $column[$i]->{plot_val} = 0;
	}
    }
}



sub calculate_bins {
    # fits the range of x into a max number of bins
    my @args = @_;
    my $num_points = $args[0];
    my $max_x_gridpts = $args[1];
    for (my $i = 1 ; $i > -1 ; $i++){
	my $test_bins = $num_points / $i;
	#hack for conversion to int, only floats will have more than 1 element
	my @numbers = split /"."/ , $test_bins;
	my $int = $numbers[0];
	my $dec = 0;
	if (scalar (@numbers) >1 && $test_bins <= $max_x_gridpts){
	    return ($int+1);
	}elsif ( $test_bins <= $max_x_gridpts ){
	    return  $int;
	}
    }
}

sub grid_plot {
    #actual plotting sub -- goes across x by y outputting symbols.
    my @args = @_;
    my $grid_array_ref=$args[0];
    my @grid_array=@{$grid_array_ref};
    my $xgrid = scalar @grid_array;
    my $ygrid = scalar @{$grid_array[0]};
    for (my $y= ($ygrid - 1); $y >= 0; $y--){
	for (my $x = 0; $x < $xgrid; $x++){
	    my @column = @{$grid_array[$x]};
	    my %box = %{$column[$y]};
	    my $box_val = $box{plot_val};
	    # y-axis values are average of limits
	    my $y_axis = ($box{maxy} + $box{miny})/2;
	    #print out the y axis values in fixed width

	    if ($x == 0){
		printf ("%9.9s|", &format_number ($y_axis,9));
	    }
	    #print a symbol or a blank
	    if ($box_val != 0){
		print SYMBOL;
	    } else {
		print " ";
	    }
	}
	print "\n";
    }
    &print_x_axis($grid_array_ref);
}

sub print_x_axis {
    my @args = @_;
    my $grid_array_ref=$args[0];
    my @grid_array=@{$grid_array_ref};
    my $xgrid = (scalar @grid_array) ;
    #x axis major ticks and tailing ticks
    my $x_major = $xgrid / 7;
    my $tail_char = $xgrid % 7;
    #leader
    print "---------|";
    #major ticks
    print "---^---" x $x_major ;
    for (my $i = $tail_char; $i > 0 ; $i--){
	if ($i > 1 ){
	    print "-";
	}
	if ($i == 1){
	    print "^";
	}
    }
#get the mean value of major tick bins    
    print "\n";

    my @axis_array = ();
    for ( my $x = 0; $x < $xgrid ; $x++){
	my $xval = 0;
	#x-axis is printed in 7 character widths
	#values are the average of the middle gridpoint
	#so, 4,11,18, etc.
	if (($x - 4) % 7 == 0){
	    my @column = @{$grid_array[$x]};
	    my %box = %{$column[0]};
	    $xval = ($box{maxx} + $box{minx})/2;
	    # this array stores the values
	    push @axis_array, $xval;
	    
	    }
	}
    #avoids duplicate prints if xgrid = 7n +/- 5
    if ((($xgrid - 6) % 7) == 0){

	pop @axis_array;
    }

    #convert the float to a fixed width string
    # to keep exponent and sign
    # if number is in exponential notation,
    # reformat to two strings

    print"  PLOT   |";

    #now print as a fixed string to keep things lined up
    for (my $x = 0; $x < scalar(@axis_array); $x++){
	my $field = &format_number ($axis_array[$x],6);
	printf ("%6.6s|",$field);
    }
    #re-lookup the values for the last array (last element of array, still binned)
    my @column = @{$grid_array[$xgrid -1 ]};
    my %box = %{$column[0]};
    my $last_bin_value = ( $box{maxx} + $box{minx}) / 2;


    printf &format_number($last_bin_value,6);
    print "\n";
}

sub format_number {
    #returns a value with fixed width including 
    #signs and exponents
    #takes 2 arguments, value and total length
    #radix is always length -3 and exponent as length of 3
    my @args = @_;
    my $value = $args[0];
    my $length = $args[1];
    my $radix_length = $length - 3;
    #convert to floating point of total length
    $value = sprintf ("%.*g",$length , $value);
    my @strings = split /e/, $value;
    my $output = 0;
    if (scalar (@strings) > 1){
	my $radix = sprintf ("%.*s", $radix_length, $strings[0]);
	my $exp = sprintf ("%.3s", $strings[1]);
	$output = $radix.$exp;
    }
    else {
	$output = sprintf ("%.*s", $length, $strings[0]); 
	#remove pointless trailing point
	$output =~ s/\.$//;
    }
    return $output;
}

sub main {
    my @args = @ARGV;
    my $filename = $args[0];
    my $array_ref = &make_data_hash($filename);
    my $stat_hash_ref = &analyze_data($array_ref);
    my $num_bins = &calculate_bins ($stat_hash_ref->{points}, MAX_X_GRIDPTS );
    my $grid_array_ref = &create_grid($num_bins, YGRID, $stat_hash_ref);
    &bin_data ($array_ref, $grid_array_ref);
    print ":" x ( MAX_X_GRIDPTS + 10);
    print "\n";
    &grid_plot($grid_array_ref);
    &print_stats($stat_hash_ref);
    print ":" x ( MAX_X_GRIDPTS + 10);
    print "\n";
}

main();

exit;


