#!/usr/bin/perl ##################################### # Sub-routine get_files($) # # # # gets a list of files that provide # # the data for the pcfg # ##################################### sub get_files($){ my $dir = shift; my @listing = glob "$dir/*"; if (@listing > 0){ foreach $file(@listing){ if(-f $file){ #only add .prd files to list if ($file =~ /\Q${suffix}\E$/){ push @files, $file; } } elsif (-d $file){ $temp = $file; $temp =~ /\//; $command = "mkdir new\/$'"; system $command; get_files($file); } else { #not interested in directory element } } } return @files; } ################################## # Sub-Routine add_to($,$) # # # # adds a rule to the lexicon # # or list of rules or # # increments its frequency count # ################################## sub add_to ($,$){ my $hash = shift; my $rule = shift; $rule =~ s/\-LRB\-/\(/g; $rule =~ s/\-RRB\-/\)/g; $rule =~ s/\-LCB\-/\{/g; $rule =~ s/\-RCB\-/\}/g; if ($hash eq "lexicon") { $num_words ++; } $rule =~ s/\s+/ /g; if (exists $$hash{$rule}){ $$hash{$rule}++; if ($hash eq "rules") { $rule =~ /\-> /; $rhs = $'; @cats = split / +/, $rhs; $num = @cats; } } else{ $$hash{$rule} = 1; if ($hash eq "rules") { #print "Adding: $rule\n"; $rule =~ /\-> /; $rhs = $'; @cats = split / / , $rhs; $num = @cats; } } } ##################################### # Sub-Routine clean($) # # # # takes out all punctuation and # # null elements # ##################################### sub clean($){ my $string = shift; # $string =~ s/[\`\?\!\"\;\:\,]|\'\'//g; $string =~ s/\s+/ /g; if ($remove_func) { if ($string =~ s/\(([A-Z]+)(\-[A-Z0-9]+)+(=[0-9]+)?(\-[0-9]+)? /\($1 /g) { #reduces all categories to their base forms, e.g. NP-SBJ-1 --> NP print LOGS "Reduced category to base form: $&\n"; } } $string =~ s/\*RCB\*/\-RCB\-/g; $string =~ s/\*LCB\*/\-LCB\-/g; $string =~ s/\*RRB\*/\-RRB\-/g; $string =~ s/\*LRB\*/\-LRB\-/g; if ($string =~ s/\*[A-Z]+\*(-[0-9]+)?//g) { print LOGS "Removed Empty Element : $&\n"; } if ($string =~ s/\*-[0-9]+//g) { print LOGS "Removed Index : $&\n"; } if ($string =~ s/[A-Z]+\*//g) { print LOGS "Removed Empty Element : $&\n"; } if ($string =~ s/\([A-Z]+ \*\)//g) { print LOGS "Removed Empty Element : $&\n"; } if ($string =~ s/\([A-Z\-]+ \*\?\*\)//g) { print LOGS "Removed Empty Element : $&\n"; } if ($string =~ s/\*\?\*//g){ print LOGS "Removed Empty Element : $&\n"; } if ($string =~ s/\([A-Z]+ \)//g){ print LOGS "Removed Empty Element : $&\n"; } return $string; } ##################################### # Sub-Routine get_probs($) # # # # assigns probabilities to each # # rule that was found # ##################################### sub get_probs($){ my $hash = shift; %totals = (); %probabilities = (); foreach $key (keys(%$hash)){ $key =~ /\-\>/; my $category = $` ; if(exists $totals{$category}){ $totals{$category}+=$$hash{$key}; } else { $totals{$category} = $$hash{$key}; } } foreach $key (keys(%$hash)){ $key =~ /\-\>/; my $category = $`; $prob = $$hash{$key} / $totals{$category}; $probabilities{$key} = log $prob; } } return 1;