|
| 1 | +#!/usr/bin/perl -w |
| 2 | +# baseline: compute a baseline classification for named entities |
| 3 | +# usage: baseline [-u] [nbr] train test |
| 4 | +# notes: option -u: only classify entities with unique class in train |
| 5 | +# method used: only tag phrases present in training data |
| 6 | +# greedy search: tag longest possible phrases |
| 7 | +# train and test are supposed to be in |
| 8 | +# CoNLL-2002 format |
| 9 | +# url: http://lcg-www.uia.ac.be/conll2002/ner/ |
| 10 | + |
| 11 | + |
| 12 | +use strict; |
| 13 | + |
| 14 | +my ( |
| 15 | + $i,$j,$k, |
| 16 | + $ambiguous,$bestCat,$bestCatNbr,$buffer,$bufferType,$debug, |
| 17 | + $key,$line,$onlyUniq,$tag,$test,$train,$type,$uniqNbr,$word, |
| 18 | + @classes,@test,@words, |
| 19 | + %hash, # hash of hashes for categories of word sequences |
| 20 | + %outWords # hash of words that appear outside of entities |
| 21 | +); |
| 22 | + |
| 23 | +$onlyUniq = 0; |
| 24 | +$uniqNbr = 0; |
| 25 | +$debug = 0; |
| 26 | +if (defined $ARGV[0] and $ARGV[0] eq "-d") { |
| 27 | + $debug = 1; |
| 28 | + shift(@ARGV); |
| 29 | +} |
| 30 | +if (defined $ARGV[0] and $ARGV[0] eq "-u") { |
| 31 | + $onlyUniq = 1; |
| 32 | + shift(@ARGV); |
| 33 | +} |
| 34 | +if (defined $ARGV[0] and $ARGV[0] =~ /^[0-9]+$/) { |
| 35 | + $uniqNbr = shift(@ARGV); |
| 36 | +} |
| 37 | +if ($#ARGV != 1) { die "usage: baseline [-u] [nbr] train test\n"; } |
| 38 | +$train = shift(@ARGV); |
| 39 | +$test = shift(@ARGV); |
| 40 | + |
| 41 | +# read train file |
| 42 | +$buffer = ""; |
| 43 | +$bufferType = ""; |
| 44 | +%hash = (); |
| 45 | +open(INFILE,$train); |
| 46 | +while (<INFILE>) { |
| 47 | + $line = $_; |
| 48 | + chomp($line); |
| 49 | + $line = "-X- O" if ($line =~ /^\s*$/); |
| 50 | + @words = split(/\s+/,$line); |
| 51 | + $word = shift(@words); # word is first item on line |
| 52 | + $tag = pop(@words); # tag is last item on line |
| 53 | + if ($tag eq "O") { $outWords{$word} = 1; } |
| 54 | + $type = $tag; |
| 55 | + $type =~ s/^.*-//; |
| 56 | + # if previous tagged phrase is complete |
| 57 | + if ($buffer and |
| 58 | + ($type eq "O" or $type ne $bufferType or $tag =~ /^B/)) { |
| 59 | + if (not defined $hash{$buffer}{$bufferType}) { |
| 60 | + $hash{$buffer}{$bufferType} = 1; |
| 61 | + } else { $hash{$buffer}{$bufferType}++; } |
| 62 | + @words = split(/\s+/,$buffer); |
| 63 | + pop(@words); |
| 64 | + # store all prefixes of entity in hash with tag PREFIX |
| 65 | + while (@words) { |
| 66 | + $line = join(" ",@words); |
| 67 | + if (not defined $hash{$line}{"PREFIX"}) { |
| 68 | + $hash{$line}{"PREFIX"} = 1; |
| 69 | + } else { $hash{$line}{"PREFIX"}++; } |
| 70 | + pop(@words); |
| 71 | + } |
| 72 | + $buffer = ""; |
| 73 | + $bufferType = ""; |
| 74 | + } |
| 75 | + # append current word to buffer if we are processing a tagged phrase |
| 76 | + if ($tag ne "O") { |
| 77 | + $buffer = $buffer ? "$buffer $word" : $word; |
| 78 | + $bufferType = $bufferType ? $bufferType : $type; |
| 79 | + } |
| 80 | +} |
| 81 | +if ($buffer) { |
| 82 | + if (not defined $hash{$buffer}{$bufferType}) { |
| 83 | + $hash{$buffer}{$bufferType} = 1; |
| 84 | + } else { $hash{$buffer}{$bufferType}++; } |
| 85 | + @words = split(/\s+/,$buffer); |
| 86 | + pop(@words); |
| 87 | + # store all prefixes of entity in hash with tag PREFIX |
| 88 | + while (@words) { |
| 89 | + $line = join(" ",@words); |
| 90 | + if (not defined $hash{$line}{"PREFIX"}) { |
| 91 | + $hash{$line}{"PREFIX"} = 1; |
| 92 | + } else { $hash{$line}{"PREFIX"}++; } |
| 93 | + pop(@words); |
| 94 | + } |
| 95 | +} |
| 96 | +close(INFILE); |
| 97 | + |
| 98 | +# read test file |
| 99 | +@test = (); |
| 100 | +open(INFILE,$test) or die "cannot open $test\n"; |
| 101 | +while (<INFILE>) { |
| 102 | + $line = $_; |
| 103 | + chomp($line); |
| 104 | + push(@test,$line); |
| 105 | +} |
| 106 | +close(INFILE); |
| 107 | + |
| 108 | +# assign entity tags to test file |
| 109 | +$i = 0; |
| 110 | +LOOP: while ($i<=$#test) { |
| 111 | + if (not $test[$i]) { print "\n"; $i++; next LOOP; } |
| 112 | + @words = split(/\s+/,$test[$i]); |
| 113 | + if (not defined %{$hash{$words[0]}}) { |
| 114 | + print "$test[$i] O\n"; |
| 115 | + $i++; |
| 116 | + } else { |
| 117 | + $j = 0; |
| 118 | + $buffer = "$words[0]"; |
| 119 | + # add words to phrase while we are in a phrase prefix and |
| 120 | + # the next word exists and is not a line break |
| 121 | + while (defined $hash{$buffer}{"PREFIX"} and |
| 122 | + $i+$j < $#test and $test[$i+$j+1]) { |
| 123 | + $j++; |
| 124 | + @words = split(/\s+/,$test[$i+$j]); |
| 125 | + $buffer .= " $words[0]"; |
| 126 | + } |
| 127 | + # remove words from entity |
| 128 | + @classes = defined $hash{$buffer} ? %{$hash{$buffer}}: (); |
| 129 | + # note: classes always contains pairs tag/amount |
| 130 | + # remove words from phrase while current phrase is nonempty and |
| 131 | + # does not contain a phrase or is only a prefix |
| 132 | + while ($buffer and |
| 133 | + ($#classes < 0 or |
| 134 | + ($#classes == 1 and defined $hash{$buffer}{"PREFIX"})) or |
| 135 | + ($onlyUniq and |
| 136 | + ($#classes > 3 or |
| 137 | + ($#classes > 1 and not defined $hash{$buffer}{"PREFIX"})))) { |
| 138 | + $j--; |
| 139 | + @words = split(/\s+/,$buffer); |
| 140 | + pop(@words); |
| 141 | + $buffer = join(" ",@words); |
| 142 | + @classes = defined $hash{$buffer} ? %{$hash{$buffer}}: (); |
| 143 | + } |
| 144 | + if ($debug) { |
| 145 | + # show phrase with possible classification and nbr of examples |
| 146 | + print ">>> $#classes $buffer "; |
| 147 | + foreach $i (@classes) { print "# $i "; } |
| 148 | + print "\n"; |
| 149 | + } |
| 150 | + # if no complete entity was found |
| 151 | + if (not $buffer) { |
| 152 | + print "$test[$i] O\n"; |
| 153 | + $i++; |
| 154 | + next LOOP; |
| 155 | + } |
| 156 | + # get category |
| 157 | + $bestCat = "UNDEF"; |
| 158 | + $bestCatNbr = 0; |
| 159 | + foreach $key (sort keys %{$hash{$buffer}}) { |
| 160 | + if ($key ne "PREFIX" and $hash{$buffer}{$key} > $bestCatNbr) { |
| 161 | + $bestCatNbr = $hash{$buffer}{$key}; |
| 162 | + $bestCat = $key; |
| 163 | + } |
| 164 | + } |
| 165 | + # does the phrase occur frequently enough in the training data? |
| 166 | + if ($bestCatNbr < $uniqNbr) { |
| 167 | + print "$test[$i] O\n"; |
| 168 | + $i++; |
| 169 | + next LOOP; |
| 170 | + } |
| 171 | + for ($k=$i;$k<=$i+$j;$k++) { |
| 172 | + if ($k == $i) { print "$test[$k] B-$bestCat\n"; } |
| 173 | + else { print "$test[$k] I-$bestCat\n"; } |
| 174 | + } |
| 175 | + $i += $j+1; |
| 176 | + } |
| 177 | +} |
| 178 | + |
| 179 | +exit(0); |
0 commit comments