#!perl -w
use strict;
use warnings;
use Getopt::Std;

# hapax.pl - count word tokens, word types and hapax legomena in a text file.
#
# Invocation: perl hapax.pl (-p) filename
#   -p   also print each hapax legomenon, one per line, in sorted order
#
# A "word" is a maximal run of the letters a-z after the line has been
# lower-cased; every other character acts as a separator.

my $usage = "usage: perl hapax.pl (-p) filename\n";

my %opts;
getopts 'p', \%opts;

# A missing filename is a usage error; a file that cannot be opened is
# reported with the operating system's reason rather than the usage text.
my $filename = $ARGV[0];
die $usage unless defined $filename;

# Three-argument open with a lexical handle: the filename can never be
# interpreted as an open mode, and the handle is scoped to this script.
open my $fh, '<', $filename
    or die "hapax.pl: cannot open '$filename': $!\n$usage";

# Build the concordance while streaming the file line by line.
my %count_of;      # word => number of occurrences
my $tokens = 0;    # total number of words seen

while (my $line = <$fh>) {
    # Matching the letter runs directly (instead of splitting on the
    # separators) never produces an empty "word" when a line starts with
    # punctuation, a digit or whitespace.
    for my $word (lc($line) =~ /[a-z]+/g) {
        $count_of{$word}++;
        $tokens++;
    }
}
close $fh;

# Find the words that occur exactly once, in sorted order.
my @hapaxes = grep { $count_of{$_} == 1 } sort keys %count_of;

if ($opts{p}) {
    print "$_\n" for @hapaxes;
}

# Print summary stats. The counters are plain numbers, so an empty input
# reports zeros instead of triggering "uninitialized value" warnings.
my $hapax = scalar @hapaxes;
print "Hapax legomena: $hapax\n";
print "Word types: ", scalar keys %count_of, "\n";
print "Word tokens: $tokens\n";

=head1 NAME

hapax.pl - Counts hapax legomena in a text

=head1 SYNOPSIS

  perl hapax.pl (-p) textfile

=head1 DESCRIPTION

Finds the number of words in a text, the number of distinct words, and
the subset of those distinct words that only occur once (B<hapax legomena>).
The C<-p> flag prints those items to standard output.

=head1 AUTHOR

Michael Hammond

=cut