#!/usr/bin/perl
#####################################################
# This program converts any PCFG into a PCFG in     #
# Chomsky Normal Form                               #
#                                                   #
# usage:  perl convert.pl rules_file                #
# output: a file "cnf_grammar"                      #
# Aoife Cahill                                      #
# April 2001                                        #
#####################################################
#
# Input:  one rule per line; the rule text and its probability are
#         separated by two tab characters.  Inside the rule text the head
#         is everything before "-> " and the right-hand side is a
#         whitespace-separated list of categories.
# Output: "cnf_grammar" in the current directory, same two-tab layout,
#         rules sorted as strings.
#
# Only rules with more than two right-hand-side categories are rewritten
# (binarized through generated non-terminals R0, R1, ...).  Rules with one
# or two categories are copied through unchanged, and rule texts that do
# not contain "-> " are dropped.
#
# NOTE(review): generated names can collide with an input grammar that
# already uses categories called R<number> - confirm the grammar does not.

use strict;
use warnings;

my $r_file = shift;
die "usage: perl convert.pl rules_file\n" unless defined $r_file;

my %rules = read_rules($r_file);
my ($new_rules, $count, $deleted) = binarize_rules(\%rules);

print "Added $count extra rules\n";
print "Saved $deleted rules\n";

my $total_rules = write_rules('cnf_grammar', $new_rules);
print "Total Rules: $total_rules\n";

# Read the rules file and return a hash mapping rule text => probability.
# Blank lines are ignored; non-blank lines without the two-tab separator are
# reported on STDERR and skipped.  Dies if the file cannot be opened.
sub read_rules {
    my ($path) = @_;

    open my $in, '<', $path or die "Error: cannot open $path: $!\n";

    my %prob_for;
    while (my $line = <$in>) {
        chomp $line;
        next unless $line =~ /\S/;

        # Rules are separated from their probabilities by two tabs; split at
        # the first occurrence only, so the probability keeps any later tabs.
        my ($rule, $prob) = split /\t\t/, $line, 2;
        if (!defined $prob) {
            warn "Skipping line $. of $path: no two-tab separator\n";
            next;
        }
        $prob_for{$rule} = $prob;
    }
    close $in;

    return %prob_for;
}

# Binarize every rule that has more than two right-hand-side categories.
#
# Takes a reference to the rule => probability hash.  Returns a list of
#   - a reference to the resulting rule => probability hash,
#   - the number of generated non-terminals that were kept,
#   - the number of generated rules avoided by reusing an existing one.
#
# Rules are visited in sorted order so the generated names are stable.
sub binarize_rules {
    my ($rules) = @_;

    my %new_rules;
    my %head_for_rhs;    # right-hand side of a generated rule => its head
    my $count   = 0;     # index of the next unused generated non-terminal
    my $deleted = 0;

    for my $rule (sort keys %$rules) {
        my $prob = $rules->{$rule};

        # The head is kept verbatim (including any space before the arrow)
        # so rewritten rules are spelled the same way as the input.
        next unless $rule =~ /\A(.*?)-> (.*)\z/s;
        my ($head, $categories) = ($1, $2);

        # split ' ' ignores repeated or leading blanks, which would
        # otherwise show up as empty categories.
        my @cats = split ' ', $categories;

        if (@cats <= 2) {
            $new_rules{$rule} = $prob;
            next;
        }

        # A rule with n categories needs at most n-2 generated non-terminals:
        # the rule covering category i (1 <= i <= n-2) is headed by
        # R(base+i-1).  Reserve them all, then hand back one for each rule
        # that turns out to exist already.
        my $base = $count;
        $count += @cats - 2;

        # Work from the innermost pair outwards.  $child is the symbol the
        # next rule up must use as its second category: either the freshly
        # generated head or the head of an identical rule emitted earlier.
        my $child = $cats[-1];
        for my $i (reverse 1 .. $#cats - 1) {
            my $generated = 'R' . ($base + $i - 1);
            my $rhs       = "$cats[$i] $child";

            if (exists $head_for_rhs{$rhs}) {
                $count--;
                $deleted++;
                $child = $head_for_rhs{$rhs};
            }
            else {
                $head_for_rhs{$rhs} = $generated;

                # NOTE(review): generated rules are written with weight 0,
                # exactly as before.  That only preserves the grammar's
                # distribution if the consumer treats weights as log
                # probabilities - confirm, otherwise this should be 1.
                $new_rules{"$generated-> $rhs"} = 0;
                $child = $generated;
            }
        }

        # The top rule keeps the original head and the original probability.
        $new_rules{"$head-> $cats[0] $child"} = $prob;
    }

    return (\%new_rules, $count, $deleted);
}

# Write the rules, sorted as strings, one per line as "rule<TAB><TAB>prob".
# Returns the number of rules written.  Dies if the file cannot be opened
# or if closing it reports a write error.
sub write_rules {
    my ($path, $rules_out) = @_;

    open my $out, '>', $path or die "Error: $!";

    my $total = 0;
    for my $rule (sort keys %$rules_out) {
        $total++;
        print {$out} "$rule\t\t$rules_out->{$rule}\n";
    }
    close $out or die "Error: $!";

    return $total;
}