#!perl -w
use strict;
use warnings;
use Getopt::Std;

# Message shown whenever the command line is malformed.
my $USAGE = "usage: perl novelwords.pl (-p) filename1 filename2\n";

# makeconcordance($filename)
#
# Reads the named file and builds a concordance of its lower-cased words,
# where a word is a maximal run of the ASCII letters a-z.
#
# Returns a two-element list: a reference to a hash mapping each word to the
# number of times it occurs, and the total number of word tokens (0 for a
# file that contains no words).
#
# Dies with the usage message if no filename is given, and with a message
# naming the file and the system error if the file cannot be opened.
sub makeconcordance {
    my $f = shift;

    # Three-argument open treats an undef filename as a request for an
    # anonymous temporary file, so reject it explicitly.
    die $USAGE unless defined $f;

    my %words;
    my $tokens = 0;

    open my $fh, '<', $f
        or die "novelwords.pl: cannot open '$f': $!\n";

    while (my $line = <$fh>) {
        my $lower = lc $line;

        # Matching the words directly (rather than splitting on the
        # separators) never yields an empty string, so lines that begin
        # with punctuation or whitespace do not add a bogus "" word.
        foreach my $word ($lower =~ /[a-z]+/g) {
            $words{$word}++;
            $tokens++;
        }
    }
    close $fh;

    return (\%words, $tokens);
}

# main()
#
# Command-line driver: parses the -p flag, builds a concordance for each of
# the two files named in @ARGV, optionally prints (to standard output) the
# word types that occur in the second text but not in the first, and writes
# summary statistics to standard error.
sub main {
    my %opts;
    getopts('p', \%opts) or die $USAGE;
    die $USAGE if @ARGV < 2;

    my ($words1, $tok1) = makeconcordance($ARGV[0]);
    my ($words2, $tok2) = makeconcordance($ARGV[1]);

    # Find the word types of the second text that are absent from the first.
    my $new = 0;
    foreach my $word (sort keys %$words2) {
        next if exists $words1->{$word};
        print "$word\n" if $opts{p};
        $new++;
    }

    # Print summary stats.
    print STDERR "Word types in first text: ", scalar keys %$words1, "\n";
    print STDERR "Word tokens in first text: $tok1\n";
    print STDERR "Word types in second text: ", scalar keys %$words2, "\n";
    print STDERR "Word tokens in second text: $tok2\n";
    print STDERR "New word types in second text: $new\n";

    return;
}

# Run only when executed as a script, so the file can also be loaded
# (e.g. by a test) without touching @ARGV.
main() unless caller;

=head1 NAME

novelwords.pl - Counts novel words in a text

=head1 SYNOPSIS

perl novelwords.pl (-p) textfile1 textfile2

=head1 DESCRIPTION

Finds the number of words and the number of distinct words in two
texts. Reports the number of words that occur in the second text, but
not the first. The C<-p> flag prints those items to standard output.
The summary statistics are written to standard error.

=head1 AUTHOR

Michael Hammond

=for comment
The author's contact details appear to have been lost from this copy of
the file; restore them here if they are known.

=cut