#!perl -w
use strict;
use warnings;

# Base-2 logarithm of a strictly positive number.
sub log2 {
    my ($n) = @_;
    return log($n) / log(2);
}

# Normalize one line of text and return its words as a list.
# The line is lower-cased and every run of characters outside a-z is
# turned into a single space.  Splitting on the special pattern ' '
# (rather than / +/) discards leading whitespace, so a line that starts
# with punctuation, digits or indentation does not yield an empty "word".
sub tokenize_line {
    my ($line) = @_;
    $line = lc $line;
    $line =~ s/[^a-z]+/ /g;
    return split ' ', $line;
}

# Per-word entropy (in bits) of a word sequence under a bigram model.
# Takes the words as a flat list.  Each transition probability is
# count(prev, curr) / count(prev), estimated from the same sequence.
# The summed negative log-probabilities are divided by the number of
# words, matching the original script's normalization.
# Returns 0 for an empty list or a single word (no transitions).
sub bigram_entropy {
    my @words = @_;
    my $total = scalar @words;
    return 0 if $total == 0;

    # First pass: collect unigram and bigram counts.
    my (%unigrams, %bigrams);
    $unigrams{ $words[0] }++;
    for my $i (1 .. $#words) {
        $bigrams{ $words[ $i - 1 ] }{ $words[$i] }++;
        $unigrams{ $words[$i] }++;
    }

    # Second pass: accumulate -log2 P(curr | prev) over every transition.
    my $entropy = 0;
    for my $i (1 .. $#words) {
        my ($prev, $curr) = @words[ $i - 1, $i ];
        my $prob = $bigrams{$prev}{$curr} / $unigrams{$prev};
        $entropy -= log2($prob);
    }

    return $entropy / $total;
}

# Command-line entry point: read the file named by the first argument,
# tokenize it, and print the entropy and perplexity.
# Dies on a missing argument, an unreadable file, or a file with no words.
sub main {
    my @args = @_;
    if (@args < 1) {
        die "usage: perl entropy2a.pl filename\n";
    }
    my $path = $args[0];

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as a mode or a pipe.
    open my $fh, '<', $path
        or die "can't open file '$path': $!\n";

    my @words;
    while (my $line = <$fh>) {
        push @words, tokenize_line($line);
    }
    close $fh;

    # Without this guard the per-word average would divide by zero.
    die "no words found in '$path'\n" unless @words;

    my $entropy = bigram_entropy(@words);
    print "entropy: $entropy\n";
    print "perplexity: ", 2**$entropy, "\n";
    return 0;
}

# Run only when executed as a script, so the subs can be loaded and
# exercised without touching @ARGV.
main(@ARGV) unless caller;

1;

=head1 NAME

entropy2.pl - Calculates the per-word entropy of a text using bigrams

=head1 SYNOPSIS

perl entropy2.pl textfile

=head1 DESCRIPTION

This program calculates the per-word entropy of a text based on bigram
probabilities. The text is given in an ascii file specified on the
command-line. Punctuation is stripped, and capitalization is removed.

=head1 AUTHOR

Michael Hammond, F

=cut