#!perl -w

use strict;

sub log2 {
	my $n = shift;
	return (log($n)/log(2));
}

#read from training text
open F, $ARGV[0] or
	die "usage: perl addonecross.pl training-file test-file\n";
my @lines1 = <F>;
close F;

my (%words1, $total1, @text1);

#tokenize and make counts for training text
foreach my $line (@lines1) {
	chomp $line;
	my @words = split /[^a-zA-Z]+/, $line;
	foreach my $word (@words) {
		$words1{$word}++;
		$total1++;
		push @text1, $word;
	}
}

#add one to counts for training text
foreach my $word (keys %words1) {
	$words1{$word}++;
	$total1++;
}

#read words in test text
open F, $ARGV[1] or
	die "usage: perl addonecross.pl training-file test-file\n";
my @lines = <F>;
close F;

#collect words in test text
my @text;
foreach my $line (@lines) {
	chomp $line;
	my @words = split /[^a-zA-Z]+/, $line;
	push @text, @words;
}

#add one for words in test text not in training text
foreach my $word (@text) {
	if (!$words1{$word}) {
		$words1{$word} = 1;
		$total1++;
	}
}

#compute cross-entropy
my $entropy;
my $prob;
foreach my $word (@text) {
	if ($words1{$word}) {
		$prob = $words1{$word} / $total1;
	} else {
		$prob = 0;
	}
	$entropy += log2($prob);
}
$entropy *= -1;
$entropy = $entropy/(scalar @text);

print "entropy: $entropy\n";
print "perplexity: ", 2**$entropy, "\n";

=head1 NAME

addonecross.pl - Calculates the cross-entropy of a text using add-one smoothing

=head1 SYNOPSIS

perl addonecross.pl textfile1 textfile2

=head1 DESCRIPTION

This program calculates the cross-entropy of a text based on unigram
probabilities, using add-one smoothing. The training text and test text are
given in ASCII files specified on the command line. Punctuation is stripped,
but capitalized words are treated as distinct.

=head1 AUTHOR

Michael Hammond, F

=cut
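
=head1 EXAMPLE

The toy sketch below is not part of the original program; the counts and words
in it are invented purely to illustrate the same add-one estimate the script
computes: every known training count is incremented by one, unseen test words
receive a count of one, and the cross-entropy is the negative mean base-2 log
probability of the test words, with perplexity 2**entropy.

	# Illustrative sketch only (toy data, not read from any file):
	my %counts = (the => 3, cat => 1);          # toy training counts
	my $total  = 4;
	$counts{$_}++, $total++ for keys %counts;   # add one to every known word
	my @test = qw(the dog);                     # "dog" is unseen in training
	foreach my $w (@test) {
		if (!$counts{$w}) { $counts{$w} = 1; $total++; }   # unseen word gets count 1
	}
	my $h = 0;
	$h -= log($counts{$_} / $total) / log(2) for @test;
	printf "entropy %.3f, perplexity %.3f\n", $h / @test, 2 ** ($h / @test);

=cut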