#!/usr/cs/bin/perl
#
# File: get_fandw.pl
#
# Purpose: Generate global (gloss) vocab matrix from source F and W files
#
# Modifications:
#   19-JAN-1998  txe  Initial creation
#   21-JAN-1998  txe  Overhauled to run faster, uses axes files
#   17-FEB-1998  txe  Updated to use GetArgument
#   18-FEB-1998  txe  Changed require to subs directory
#   11-APR-1998  txe  Now uses /pie/ instead of /tmp
#   30-APR-1998  txe  Automatically creates axes.txt if not there
#

  require "subs/ir_subs.pm";

  # Driver: pick the data directory, make sure the axes (term list and
  # coll_id list) are available, then write the F and W matrices.
  #
  # GetArgument comes from subs/ir_subs.pm; it is passed argument index 0, a
  # description and the fallback value "test".
  # NOTE(review): its exact prompting/defaulting rules are not visible here.
  $data_dir = GetArgument (0, "data directory", "test");

  # The two test data sets read a small local tree; everything else reads the
  # full gloss data tree under /pie.
  if ($data_dir eq "test" || $data_dir eq "test2") {
    $tree = "test/f_and_w/*";
  }
  else {
    $tree = "/pie/gloss_data/*/*/*";
  }

  # All output (and the cached axes) live inside the data directory.
  $f_file    = "$data_dir/f.txt";
  $w_file    = "$data_dir/w.txt";
  $axes_file = "$data_dir/axes.txt";

# $axes_file = $ARGV[0];

  if (-e $axes_file) {
    # Reuse the cached axes so every run produces the same column order.
    LoadAxes ($axes_file);
  }
  else {
    if ($axes_file ne "") {
      # The interactive confirmation is disabled: the script answers its own
      # question and always goes on to create the axes file.
      print "axes_file '$axes_file' does not exist.  Create it?\n";
      print "yes!!!!!!!!!!!\n";
#      $ch = substr (<STDIN>, 0, 1);
#      if ($ch ne "y" && $ch ne "Y") {
#        $axes_file = "";
#      }
    }

    # $axes_file is always "$data_dir/axes.txt" here, so this prompt only runs
    # if the confirmation above is re-enabled and the user declines.
    while ($axes_file eq "") {
      print "CTRL-C & use axes_file as 1st arg, or enter axes_file to create:\n";
      $axes_file = <STDIN>;
      # Without this check a closed STDIN would make the loop spin forever.
      die ("No axes_file given (end of input)\n") if (!defined ($axes_file));
      chomp ($axes_file);
    }

    # The axes are derived from the F.txt files only, then cached on disk.
    CalculateAxes ($tree, "F.txt");
    SaveAxes ($axes_file);
  }

  GetFandW ($tree, "F.txt", $f_file);
  GetFandW ($tree, "W.txt", $w_file);
  print "Done.\n";

##############################################################################

# CalculateAxes ($tree, $in_name)
#
# Builds both axes of the matrix by scanning every file that matches the glob
# pattern "$tree/$in_name":
#   - the name of the directory holding each matched file becomes a coll_id;
#   - the first whitespace-separated field of every line becomes a term.
#
# Results are left in the globals @terms and @coll_ids, each sorted (default
# string sort) with duplicates removed.  Dies if a matched file cannot be
# opened.
sub CalculateAxes {
  my ($tree, $in_name) = @_;

  my %seen_terms    = ();
  my %seen_coll_ids = ();

  print "Reading vocab from tree '$tree'...\n";

  # List-context glob: no hidden iterator state is left behind between calls.
  foreach my $in_file (glob ("$tree/$in_name")) {
    print "  Reading terms from '$in_file'...\n";

    # coll_id = name of the directory that contains the file.  $in_name is
    # quoted and anchored so that "F.txt" cannot match e.g. "/F_txt" earlier
    # in the path.
    my $coll_id = $in_file;
    $coll_id =~ s#/\Q$in_name\E\z##;
    $coll_id =~ s#.*/##;

    $seen_coll_ids{$coll_id} = 1;

    open (my $in_fp, "<", $in_file)
      || die ("Error opening in_file '$in_file': $!\n");
    while (defined (my $line = <$in_fp>)) {
      my ($term) = split (/\s+/, $line);
      # Blank lines and lines starting with whitespace yield no usable term.
      next if (!defined ($term) || $term eq "");
      $seen_terms{$term} = 1;
    }
    close ($in_fp);
  }

  print "  Sorting terms...\n";
  @terms = sort (keys (%seen_terms));
  print "  Sorting coll_ids...\n";
  @coll_ids = sort (keys (%seen_coll_ids));
}

##############################################################################

# SaveAxes ($axes_file)
#
# Writes the globals @terms and @coll_ids to $axes_file in the layout that
# LoadAxes reads back: one term per line, a single blank line, then one
# coll_id per line.  Any existing file is overwritten.  Dies if the file
# cannot be opened or fully written.
sub SaveAxes {
  my ($axes_file) = @_;

  my $num_terms    = @terms;
  my $num_coll_ids = @coll_ids;

  print "Saving $num_terms terms and $num_coll_ids coll_ids to '$axes_file'...\n";

  # Three-argument open: the name is used exactly as given, so whitespace or
  # mode characters in it cannot change what gets opened.
  open (my $axes_fp, ">", $axes_file)
    || die ("Error opening axes_file '$axes_file': $!\n");

  foreach my $term (@terms) {
    print $axes_fp "$term\n";
  }
  # The blank line separates the two axes.
  print $axes_fp "\n";
  foreach my $coll_id (@coll_ids) {
    print $axes_fp "$coll_id\n";
  }

  # Buffered write errors (e.g. disk full) only surface at close time.
  close ($axes_fp)
    || die ("Error writing axes_file '$axes_file': $!\n");
}

##############################################################################

# LoadAxes ($axes_file)
#
# Reads the two axes back from $axes_file into the globals @terms and
# @coll_ids (both are emptied first).  Expected layout: one term per line, a
# blank line, then one coll_id per line; reading of each section stops at the
# first blank line or at end of file.  Dies if the file cannot be opened.
sub LoadAxes {
  my ($axes_file) = @_;

  print "Loading axes from '$axes_file'...\n";

  @terms    = ();
  @coll_ids = ();

  open (my $axes_fp, "<", $axes_file)
    || die ("Error opening axes_file '$axes_file': $!\n");

  # Both sections have the same shape, so fill them with the same loop.
  foreach my $axis (\@terms, \@coll_ids) {
    while (defined (my $line = <$axes_fp>)) {
      # chomp (not chop): a final line without a newline keeps its last
      # character.
      chomp ($line);
      last if ($line eq "");
      push (@$axis, $line);
    }
  }

  close ($axes_fp);

  my $num_terms    = @terms;
  my $num_coll_ids = @coll_ids;
  print "Read $num_terms terms and $num_coll_ids coll_ids from '$axes_file'\n";
}

##############################################################################

# GetFandW ($tree, $in_name, $out_file)
#
# Writes one matrix to $out_file (overwriting it), using the global @terms as
# the column order:
#   - a header row: "TERMS:" followed by every term;
#   - one row per file matching the glob pattern "$tree/$in_name": the
#     coll_id (name of the directory holding the file) followed by the number
#     recorded for each term, or 0 when the file has no value for that term.
# Every cell, including the last one of a row, is followed by a tab.
#
# Each input line is split on whitespace into a term and its number.  Dies if
# an input file or the output file cannot be opened, or if the output cannot
# be fully written.
sub GetFandW {
  my ($tree, $in_name, $out_file) = @_;

  print "Reading values from tree '$tree'...\n";

  print "  Printing terms axis...\n";

  # The output is opened once and kept open for all rows.
  open (my $out_fp, ">", $out_file)
    || die ("Error opening out_file '$out_file': $!\n");
  print $out_fp "TERMS:\t";
  foreach my $term (@terms) {
    print $out_fp "$term\t";
  }
  print $out_fp "\n";

  foreach my $in_file (glob ("$tree/$in_name")) {
    # coll_id = name of the directory that contains the file.  $in_name is
    # quoted and anchored so that "F.txt" cannot match e.g. "/F_txt" earlier
    # in the path.
    my $coll_id = $in_file;
    $coll_id =~ s#/\Q$in_name\E\z##;
    $coll_id =~ s#.*/##;

    my %numbers = ();
    print "  Reading numbers for coll_id '$coll_id'...\n";

    open (my $in_fp, "<", $in_file)
      || die ("Error opening in_file '$in_file': $!\n");
    while (defined (my $line = <$in_fp>)) {
      my ($term, $number) = split (/\s+/, $line);
      next if (!defined ($term));     # blank line
      $numbers{$term} = $number;
    }
    close ($in_fp);

    print "  Writing numbers for coll_id '$coll_id' to '$out_file'...\n";

    print $out_fp "$coll_id\t";
    foreach my $term (@terms) {
      my $number = $numbers{$term};
      # Terms missing from this collection (or listed without a number)
      # are written as 0.
      $number = 0 if (!defined ($number) || $number eq "");
      print $out_fp "$number\t";
    }
    print $out_fp "\n";
  }

  # Buffered write errors (e.g. disk full) only surface at close time.
  close ($out_fp)
    || die ("Error writing out_file '$out_file': $!\n");
}
