#!/usr/bin/perl5
#
# File: testmap.pl
#
# Purpose: Verifies counts of doc_ids in the mapping file, the source
#   files, and a pre-generated frequency list.  Run this after running
#   buildmap.pl to make sure buildmap.pl worked correctly.
#
# Modifications:
#   13-JAN-1997  txe  Initial creation
#

# Script entry point: run the verification with the fixed file locations.
# Arguments are passed in the order TestMap unpacks them.
TestMap (
  "coll_map.txt",    # map_file: the mapping file to check
  "/pie/trec",       # root_dir: top of the source file tree
  "testmap.good",    # cnt_file: pre-generated frequency list
  "testmap.out"      # out_file: report written by this run
);

#
# TestMap: For every file matching $root_dir/*/*/*, counts the lines that
#   contain "<DOCNO>" in that file and the lines in $map_file that contain
#   the file's base name, and compares both with the expected count that
#   ReadGoodCounts loaded into %num_goods from $cnt_file.
#
#   One line per collection is written to $out_file ("id count" when all
#   three counts agree, "id tags/ids/good <-- mismatch" otherwise) and
#   progress is echoed to STDOUT.  $out_file is overwritten on each run.
#
#   Dies if $out_file cannot be opened or closed; ReadGoodCounts and
#   CountMatches die if their input files cannot be opened.
#
#   The number of mismatches is left in the package global $error_count.
#
sub TestMap {
  my ($map_file, $root_dir, $cnt_file, $out_file) = @_;

  # Open the report once, truncating any previous run's output, instead of
  # unlinking it and then re-opening it in append mode for every collection.
  open (my $out_fp, '>', $out_file)
    || die ("Error opening out_file '$out_file': $!\n");

  # Unbuffer the report so each result reaches the file as soon as it is
  # printed, as it did when the file was re-opened on every iteration.
  my $prev_fp = select ($out_fp);
  $| = 1;
  select ($prev_fp);

  ReadGoodCounts ($cnt_file);
  $error_count = 0;

  foreach my $coll_file (glob ("$root_dir/*/*/*")) {
    print "Counting number of doc_ids in collection $coll_file...\n";

    # The collection id is the file name with its directory part removed.
    (my $coll_id = $coll_file) =~ s#.*/##;

    my $num_tags = CountMatches ($coll_file, "<DOCNO>");
    my $num_ids  = CountMatches ($map_file,  $coll_id);
    my $num_good = $num_goods{$coll_id};

    # Compare numerically, but only when the expected count is present and
    # really is a number: a missing or malformed entry must be reported as
    # a mismatch rather than be treated as 0.
    my $good_is_number = defined ($num_good) && ($num_good =~ /^\d+$/);

    if ($good_is_number && ($num_tags == $num_ids) && ($num_tags == $num_good)) {
      print "  $num_tags matches, verified.\n";
      print $out_fp "$coll_id $num_tags\n";
    }
    else {
      # A collection with no expected count shows an empty third field.
      $num_good = "" unless defined ($num_good);
      my $all_three = "$num_tags/$num_ids/$num_good";
      print "  WARNING: mismatch!  num_tags/num_ids/num_good = $all_three\n";
      print $out_fp "$coll_id $all_three <-- mismatch\n";
      $error_count++;
    }
  }

  # Write errors can surface at close time, so check it.
  close ($out_fp) || die ("Error closing out_file '$out_file': $!\n");

  print "Completed with $error_count errors.\n";
}

#
# ReadGoodCounts: Reads $cnt_file, whose lines are expected to hold a
#   collection id followed by a count, separated by whitespace, and stores
#   each count in the package global hash %num_goods keyed by the id.
#   Existing entries in %num_goods are kept unless the file redefines them.
#   Blank lines are ignored.  Dies if the file cannot be opened.
#
sub ReadGoodCounts {
  my ($cnt_file) = @_;

  # Three-argument open with a lexical handle: the file name is never
  # interpreted as an open mode, and no package-wide handle is shared
  # with other subs.
  open (my $fp, '<', $cnt_file)
    || die ("Error opening cnt_file '$cnt_file': $!\n");

  while (my $line = <$fp>) {
    # split ' ' ignores leading whitespace, so an indented line still
    # yields the id as the first field instead of an empty string.
    my ($coll_id, $num_good) = split (' ', $line);

    # A blank line has no fields; do not create a bogus '' entry for it.
    next unless defined ($coll_id);

    $num_goods{$coll_id} = $num_good;
  }

  close ($fp);
}

#
# CountMatches: Returns the number of lines in $in_file that contain
#   $search_string.  The string is matched literally, and a line is counted
#   once no matter how many times the string occurs on it.  Dies if the
#   file cannot be opened.
#
sub CountMatches {
  my ($in_file, $search_string) = @_;

  # Three-argument open with a lexical handle: the file name is never
  # interpreted as an open mode, and no package-wide handle is shared
  # with other subs.
  open (my $fp, '<', $in_file)
    || die ("Error opening in_file '$in_file': $!\n");

  my $matches = 0;

  while (my $line = <$fp>) {
    # index() does a plain substring test, so characters such as '+', '('
    # or '[' in the search string cannot be mistaken for regex syntax.
    $matches++ if (index ($line, $search_string) >= 0);
  }
  close ($fp);

  return $matches;
}
