#!perl -w
use strict;

package Unigrams;

# Command-line entry point. The body runs only when this file is executed
# directly, not when it is loaded with use/require: caller() is false at the
# top level of the main program no matter which path was used to invoke it,
# whereas comparing $0 with a fixed name misses e.g. "perl ./Unigrams.pm".
if (!caller()) {
	if (@ARGV < 2) {
		die "usage: perl Unigrams.pm textfile sentencefile\n";
	}
	# Build the model from the text file, then score the sentence file.
	my $model = Unigrams->new($ARGV[0]);
	$model->check($ARGV[1]);
}

# Constructor: Unigrams->new($textfile)
# Blesses an empty hash into the invoking class, passes the text file name
# to initialize() on the new object, and returns the object.
sub new {
	my ($class, $textfile) = @_;
	my $self = bless {}, $class;
	$self->initialize($textfile);
	return $self;
}

# initialize($file)
# Builds the unigram model from the text in $file and stores it in
# $self->{unigrams} as a hash reference mapping each lower-cased word to its
# relative frequency (occurrences of the word / total number of words).
#
# Runs of . ? ! end a sentence; commas, colons and semicolons are treated as
# whitespace. Other characters are left attached to their words.
#
# Dies if no file name is given or the file cannot be opened.
# Returns the hash reference that was stored.
sub initialize {
	my ($self, $file) = @_;
	die "No text file given\n" unless defined $file;
	# Three-argument open with a lexical handle: the name is always taken as a
	# plain file to read, never as a mode prefix or a piped command.
	open my $in, '<', $file
		or die "Cannot open text file '$file': $!\n";
	my @lines = <$in>;
	close $in;

	my $text = join ' ', @lines;
	$text =~ s/\n+/ /g;        # a sentence may span several physical lines
	$text =~ s/[.?!]+/\n/g;    # sentence terminators become line breaks
	$text =~ s/[,:;]/ /g;
	$text = lc $text;

	my %unigrams;
	my $total = 0;
	foreach my $sentence (split /\n/, $text) {
		# split ' ' breaks on runs of any whitespace and drops a leading empty
		# field, so stray spaces, tabs or carriage returns never produce an
		# empty-string "word" that would distort the frequencies.
		foreach my $word (split ' ', $sentence) {
			$unigrams{$word}++;
			$total++;
		}
	}
	# Convert counts to relative frequencies. $total is non-zero whenever the
	# hash has any keys, so this never divides by zero.
	foreach my $word (keys %unigrams) {
		$unigrams{$word} = $unigrams{$word} / $total;
	}
	$self->{unigrams} = \%unigrams;
	return $self->{unigrams};
}

# check($sentencefile)
# Scores every line of $sentencefile against the model in $self->{unigrams}
# and prints one line per sentence to the currently selected output handle:
# the normalized sentence, a tab, and its probability.
#
# Each sentence is normalized before scoring: . ? ! are removed, commas,
# colons and semicolons become spaces, runs of spaces are collapsed, and the
# text is lower-cased. The probability is the product of the unigram
# probabilities of its words; a sentence containing any word missing from
# the model scores 0, and a sentence with no words scores 1 (empty product).
#
# Dies if no file name is given or the file cannot be opened.
sub check {
	my ($self, $sentencefile) = @_;
	die "No sentence file given\n" unless defined $sentencefile;
	# Three-argument open with a lexical handle: the name is always taken as a
	# plain file to read, never as a mode prefix or a piped command.
	open my $in, '<', $sentencefile
		or die "Cannot open sentence file '$sentencefile': $!\n";
	my @sentences = <$in>;
	close $in;

	my $unigrams = $self->{unigrams};
	foreach my $sentence (@sentences) {
		chomp $sentence;
		$sentence =~ s/[.?!]+//g;
		$sentence =~ s/[,:;]/ /g;
		$sentence =~ s/ +/ /g;
		$sentence = lc $sentence;

		my $total = 1;
		# split ' ' ignores leading whitespace and splits on whitespace runs,
		# so a sentence that starts with a space is not scored 0 because of a
		# spurious empty-string word.
		foreach my $word (split ' ', $sentence) {
			my $probability = $unigrams->{$word};
			if (!$probability) {
				# Unknown word: the product is 0 whatever follows.
				$total = 0;
				last;
			}
			$total *= $probability;
		}
		print "$sentence\t$total\n";
	}
	return;
}

1;

=head1 NAME

Unigrams - I<Very> simple unigram language models in Perl

=head1 COMMAND-LINE SYNOPSIS

	perl Unigrams.pm textfile sentencefile

=head1 MODULE SYNOPSIS

	use Unigrams;

	$b = Unigrams->new(textfile);
	$b->check(sentencefile);

=head1 DESCRIPTION

This module can be called by other programs or in a stand-alone mode.
The program parses the F<textfile> into sentences, stripping
sentence-final punctuation (C<.>, C<?>, C<!>), commas, colons, and
semicolons, and converting uppercase to lowercase. It then computes,
in a sentence-by-sentence fashion, all the unigrams in the text. The
C<check()> function allows the user to submit a file of sentences (one
sentence per line) to the resulting unigram language model. Calculated
probabilities for the submitted sentences are printed out.

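The model does not smooth for out-of-vocabulary words: any sentence
containing a word that does not occur in the F<textfile> is assigned a
probability of 0.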

=head1 AUTHOR

Michael Hammond, F<hammond@u.arizona.edu>

=cut

