#!perl -w
use strict;

package Bigrams;

if ($0 eq 'Bigrams.pm') {
	if (@ARGV < 2) {
		die "usage: perl Bigrams.pm textfile sentencefile\n";
	}
	my $b = Bigrams->new($ARGV[0]);
	$b->check($ARGV[1]);
}

sub new {
	my $class = shift;
	my $self = {};
	bless $self, $class;
	my $textfile = shift;
	$self->initialize($textfile);
	return $self;
}

sub initialize {
	my $self = shift;
	my $file = shift;
	open F, $file or die "Usage: perl Bigrams.pm text-file sentence-file\n";
	my @lines = <F>;
	close F;
	my $text = join ' ', @lines;
	$text =~ s/\n+/ /g;
	$text =~ s/[\.\?!]+/\n/g;
	$text =~ s/[,:;"-]/ /g;
	$text =~ s/ +/ /g;
	$text =~ s/\n\s/\n/g;
	$text = lc $text;
	@lines = split /\n+/, $text;
	my ($total, %initial, %final, %unigrams, %bigrams, @text);
	foreach my $line (@lines) {
		my @words = split /\s/, $line;
		$bigrams{"<s>"}{$words[0]}++;
		$unigrams{"<s>"}++;
		push @text, "<s>";
		$bigrams{$words[$#words]}{"</s>"}++;
		$unigrams{"</s>"}++;
		for (my $i = 0; $i <= $#words; $i++) {
			$unigrams{$words[$i]}++;
			push @text, $words[$i];
			$total++;
			if ($i < $#words) {
				$bigrams{$words[$i]}{$words[$i+1]}++;
			}
		}
		push @text, "</s>";
	}
	foreach my $key1 (keys %bigrams) {
		my $theuni = $unigrams{$key1};
		foreach my $key2 (keys %{$bigrams{$key1}}) {
			$bigrams{$key1}{$key2} = $bigrams{$key1}{$key2} / $theuni;
		}
	}
	foreach my $key (keys %unigrams) {
		$unigrams{$key} = $unigrams{$key} / $total;
	}
	$self->{unigrams} = \%unigrams;
	$self->{bigrams} = \%bigrams;
	$self->{text} = \@text;
}

sub check {
	my $self = shift;
	my $sentencefile = shift;
	open G, $sentencefile or die "Usage: perl bigrams.pl text-file sentence-file\n";
	my @sentences = <G>;
	close G;
	my $unigrams = $self->{unigrams};
	my $bigrams = $self->{bigrams};
	foreach my $sentence (@sentences) {
		chomp $sentence;
		$sentence =~ s/[\.\?!]+//g;
		$sentence =~ s/[,:;]/ /g;
		$sentence =~ s/ +/ /g;
		$sentence = lc $sentence;
		my @words = split /\s/, $sentence;
		my $total = 1;
		my $initprob = $bigrams->{"<s>"}{$words[0]};
		if (!$initprob) { $initprob = 0; }
		$total *= $initprob;
		my $finalprob = $bigrams->{$words[$#words]}{"</s>"};
		if (!$finalprob) { $finalprob = 0; }
		$total *= $finalprob;
		for (my $i = 1; $i < @words; $i++) {
			my $currentbigram = $bigrams->{$words[$i-1]}{$words[$i]};
			if (!$currentbigram) { $currentbigram = 0; }
			$total *= $currentbigram;
		}
		print "$sentence\t$total\n";
	}
}

1;

=head1 NAME

Bigrams - I<Very> simple bigram language models in Perl

=head1 COMMAND-LINE SYNOPSIS

	perl Bigrams.pm textfile sentencefile

=head1 MODULE SYNOPSIS

	use Bigrams;

	$b = Bigrams->new(textfile);
	$b->check(sentencefile);

=head1 DESCRIPTION

This module can be called by other programs or in a stand-alone mode. The
program parses the F<textfile> into sentences, stripping all punctuation and
converting uppercase to lowercase. It then computes, in a sentence-by-sentence
fashion, all the bigrams in the text. The C<check()> function allows the user to
submit a file of sentences (one sentence per line) to the resulting bigram
language model. Calculated probabilities for the submitted senteces are printed
out.

The model does not handle out-of-vocabulary words or non-occurring bigrams.

=head1 AUTHOR

Michael Hammond, F<hammond@u.arizona.edu>

=cut

