#!/usr/bin/perl  

#####################################
# Sub-routine get_files($)          #
#                                   #
# gets a list of files that provide # 
# the data for the pcfg             #
#####################################

sub get_files($){

	my $dir = shift;
	my @listing = glob "$dir/*";

	if (@listing > 0){
		foreach $file(@listing){
			if(-f $file){
				#only add .prd files to list
				if ($file =~ /\Q${suffix}\E$/){
					push @files, $file;
				}
			}
			elsif (-d $file){
				$temp = $file;
				$temp =~ /\//;
				$command = "mkdir new\/$'";
				system $command;
				get_files($file);
			}
			else {
				#not interested in directory element
			}
		}
	}
	return @files;
}

##################################
# Sub-Routine add_to($,$)        #
#                                #
# adds a rule to the lexicon     #
# or list of rules or            #
# increments its frequency count #
##################################

sub add_to ($,$){
	
	my $hash = shift;
	my $rule = shift;

    $rule =~ s/\-LRB\-/\(/g;
    $rule =~ s/\-RRB\-/\)/g;
    $rule =~ s/\-LCB\-/\{/g;
    $rule =~ s/\-RCB\-/\}/g;

	if ($hash eq "lexicon") {
		$num_words ++;
	}

	$rule =~ s/\s+/ /g;

	if (exists $$hash{$rule}){
		$$hash{$rule}++;
	
		if ($hash eq "rules") {
			$rule =~ /\-> /;
			$rhs = $';
			@cats = split / +/, $rhs;
			$num = @cats;
		}
	}
	else{
		$$hash{$rule} = 1;
		if ($hash eq "rules") {
			#print "Adding: $rule\n";
			$rule =~ /\-> /;
			$rhs = $';
			@cats = split / / , $rhs;
			$num = @cats;
		}
	}
}

#####################################
# Sub-Routine clean($)              #
#                                   #
# takes out all punctuation and     #
# null elements                     #
#####################################
sub clean($){
	my $string = shift;
	
#	$string =~ s/[\`\?\!\"\;\:\,]|\'\'//g;
	$string =~ s/\s+/ /g;
	if ($remove_func) {
		if ($string =~ s/\(([A-Z]+)(\-[A-Z0-9]+)+(=[0-9]+)?(\-[0-9]+)? /\($1 /g) {
			#reduces all categories to their base forms, e.g. NP-SBJ-1 --> NP
			print LOGS "Reduced category to base form: $&\n";
		}
	}
	$string =~ s/\*RCB\*/\-RCB\-/g;
	$string =~ s/\*LCB\*/\-LCB\-/g;
	$string =~ s/\*RRB\*/\-RRB\-/g;
    $string =~ s/\*LRB\*/\-LRB\-/g;
	if ($string =~ s/\*[A-Z]+\*(-[0-9]+)?//g) {
		print LOGS "Removed Empty Element : $&\n";
	}
	if ($string =~ s/\*-[0-9]+//g) {
		print LOGS "Removed Index : $&\n";
	}
	if ($string =~ s/[A-Z]+\*//g) {
		print LOGS "Removed Empty Element : $&\n";
	}
	if ($string =~ s/\([A-Z]+ \*\)//g) {
		print LOGS "Removed Empty Element : $&\n";
	}
	if ($string =~ s/\([A-Z\-]+ \*\?\*\)//g) {
		print LOGS "Removed Empty Element : $&\n";
	}
	if ($string =~ s/\*\?\*//g){	
		print LOGS "Removed Empty Element : $&\n";
	}
	if ($string =~ s/\([A-Z]+ \)//g){
		print LOGS "Removed Empty Element : $&\n";
	}

	return $string;
}

#####################################
# Sub-Routine get_probs($)          #
#                                   #
# assigns probabilities to each     #
# rule that was found               #
#####################################

sub get_probs($){

	my $hash = shift;
	%totals = ();
	%probabilities = ();

	foreach $key (keys(%$hash)){
		$key =~ /\-\>/;
		my $category = $` ;
		if(exists $totals{$category}){
			$totals{$category}+=$$hash{$key};
		}
		else {
			$totals{$category} = $$hash{$key};
		}
	}

	foreach $key (keys(%$hash)){
		$key =~ /\-\>/;
		my $category = $`;
		$prob = $$hash{$key} / $totals{$category};
		$probabilities{$key} = log $prob;
	}
}
return 1;