#!perl -w

use strict;

#subroutine to calculate base-2 logarithms
sub log2 {
    my $n = shift;
    return (log($n)/log(2));
}

#check the number of command-line arguments
if (@ARGV < 3) {
    die "usage: perl addx.pl x training-file test-file\n";
}

#assign x
my $x = $ARGV[0];

#read from training text
open F, $ARGV[1] or die "can't open $ARGV[1]\n";
my @lines1 = <F>;
close F;

my (%words1, $total1, @text1);

#tokenize and make counts for training text
foreach my $line (@lines1) {
    chomp $line;
    #lowercase and split on anything that isn't a letter
    $line = lc $line;
    my @words = grep { length } split /[^a-zA-Z]+/, $line;
    foreach my $word (@words) {
        $words1{$word}++;
        $total1++;
        push @text1, $word;
    }
}

#add x to the count of every word type in the training text
foreach my $word (keys %words1) {
    $words1{$word} += $x;
    $total1 += $x;
}

#read words in test text
open F, $ARGV[2] or die "can't open $ARGV[2]\n";
my @lines = <F>;
close F;

#collect words in test text
my @text;
foreach my $line (@lines) {
    chomp $line;
    #lowercase and split on anything that isn't a letter
    $line = lc $line;
    my @words = grep { length } split /[^a-zA-Z]+/, $line;
    push @text, @words;
}

#add x for word types in the test text not in the training text
foreach my $word (@text) {
    if (!$words1{$word}) {
        $words1{$word} = $x;
        $total1 += $x;
    }
}

#compute cross-entropy
my $entropy = 0;
my $prob;
foreach my $word (@text) {
    if ($words1{$word}) {
        $prob = $words1{$word} / $total1;
    } else {
        $prob = 0;
    }
    $entropy += log2($prob);
}
$entropy *= -1;
$entropy = $entropy/(scalar @text);

print "entropy: $entropy\n";
print "perplexity: ", 2**$entropy, "\n";

=head1 NAME

addx.pl - Calculates the cross-entropy of a text using add-X smoothing

=head1 SYNOPSIS

perl addx.pl x training-file test-file

=head1 DESCRIPTION

This program calculates the cross-entropy of a test text based on
unigram probabilities estimated from a training text, using B<add-x>
smoothing. This allows one to test alternatives to B<add-one> smoothing,
since one can vary how much is added to each unattested unigram.

The training text and test text are given in ASCII files specified on
the command line. The value of x is given as the first command-line
argument.

Punctuation is stripped, and capitalized words are treated as
lowercase.

=head1 AUTHOR

Michael Hammond

=cut
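=head1 FORMULAS

The quantities printed by the script can be summarized as follows. This
is a descriptive sketch only; the symbols C<N>, C<V>, and C<c(w)> are
notation introduced here, not variables in the code. With C<N> the
number of training tokens, C<c(w)> the raw training count of word C<w>
(zero if unattested), and C<V> the number of distinct word types
occurring in the training and test texts combined, the add-x estimate
of a unigram probability is

    P(w) = (c(w) + x) / (N + x * V)

and for a test text of n tokens w_1 ... w_n the script reports

    cross-entropy H = -(1/n) * sum_i log2 P(w_i)
    perplexity      = 2 ** H

Setting x to 1 gives ordinary add-one smoothing; smaller values of x
give less probability mass to unattested words.

=cut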