# afterTag.prl
# cajoles output from treeTagger into proper TEI XML
#
# Main script: reads tab-separated tagger output from the files named on the
# command line (or STDIN), accumulates one XML element per token in $buffer,
# then patches sentence markup around <head>, <p> and <ab> and prints it.
#
# NOTE(review): this listing looks like it was damaged when copied from an
# HTML page -- several regexes below are empty, truncated or split across
# lines, and the file will not compile as it stands. The damaged statements
# are flagged individually; recover them from the original script rather
# than guessing.
#
# $sno is initialised here but never referenced in the visible code.
$sno = 1;
while (<>) {
# NOTE(review): chop removes the last character unconditionally; chomp would
# only remove a trailing newline -- confirm which is intended.
chop;
# NOTE(review): the search pattern is empty here; it presumably matched some
# markup that was lost in extraction -- TODO recover the original pattern.
s//XXXX/;
# NOTE(review): the next line is damaged: the condition's regex and the
# statement appending $_ to $buffer are fused with HTML residue. Judging by
# the inline comments, lines containing a tag are copied to $buffer as-is and
# everything else is handled as a token in the else branch.
if (/<!--) { # tag found<br /--> $buffer .= $_;
} else { # token found
# A token line has three tab-separated fields: word form, POS tag, lemma.
($str, $pos, $hw) = split(/\t/);
# Default to a <w> (word) element.
$tag = "w";
# Replace the first literal "$" in the POS tag with "s".
$pos =~ s/\$/s/;
# A POS value without two consecutive capital letters is treated as
# punctuation: emit <pc>, drop the lemma, and map the POS value to a type
# (sub, op, cp, oq, cq) according to the characters it contains.
unless ($pos =~ /[A-Z][A-Z]/) {
$hw ="";
$tag = "pc";
$pos = "sub" if($pos =~ /[\,\:\;]/);
$pos = "op" if($pos =~ /[\(\[\{]/);
$pos = "cp" if($pos =~ /[\)\]\}]/);
$pos = "oq" if($pos =~ /[\`]/);
$pos = "cq" if($pos =~ /[\"\']/);
# NOTE(review): the "</code>" after the closing brace below is HTML residue,
# not Perl.
}</code>
# Tokens tagged SENT, and tokens whose lemma contains a square bracket, are
# also emitted as <pc> without a lemma.
if ($pos=~ /SENT/) {
$tag = "pc";
$hw ="";
} elsif ($hw =~ /[\[\]]/) {
$tag = "pc";
$hw ="";
}
# NOTE(review): as written this turns "&" into "&;"; the replacement was
# presumably an XML entity that got decoded in extraction -- TODO confirm.
$str =~ s/\&/\&\;/g;
# In the lemma, "&" is replaced with the word "and".
$hw =~ s/\&/and/g;
# Only a true lemma becomes an attribute (an empty or "0" lemma is omitted).
if ($hw) {$hw = " lemma\=\"$hw\""};
$buffer .= "<$tag type\=\"$pos\"$hw>$str<\/$tag>\n";
}
} # end of file
# Post-processing of the accumulated buffer.
# NOTE(review): the pattern on the next line is truncated (unbalanced ")");
# given the </head> rule after it, it presumably handled opening <head> tags.
$buffer =~ s/]*)>//g;
# Insert </s> before every </head>.
$buffer =~ s/<\/head>/<\/s><\/head>/g;
# NOTE(review): the two active substitutions below and the partly
# commented-out one after them are split across lines with their patterns
# and replacements missing -- TODO recover from the original script.
$buffer =~ s/
]+)>/
/g;
$buffer =~ s/
/
/g;
#$buffer =~ s/
/
/g;
# Insert </s> before every </p>.
$buffer =~ s/<\/p>/<\/s><\/p>/g;
# NOTE(review): truncated pattern again; given the </ab> rule after it, it
# presumably handled opening <ab> tags.
$buffer =~ s/]+)>//g;
# Insert </s> before every </ab>.
$buffer =~ s/<\/ab>/<\/s><\/ab>/g;
# NOTE(review): the substitution below has no replacement part or closing
# delimiter, so Perl would read on into the following lines.
$buffer =~ s/(type="SENT"[^\/]+\/pc>\n)<\/s>\n
# Write out the assembled document.
print $buffer;
sub sentenceStart {
    # Marks a sentence as open by raising the global $sentenceStarted flag.
    # Returns the flag's new value (1).
    # NOTE(review): the disabled print below has an empty string; it
    # presumably emitted an opening tag before the listing was damaged.
    #print "" ;
    return $sentenceStarted = 1;
}
sub sentenceStop {
    # Attempts to close the current sentence.
    #
    # Every entry of the global %tagStash with a count above zero is reported
    # on the selected output handle as an XML comment, and blocks the close.
    # When nothing is still open, a newline is printed and the global
    # $sentenceStarted / $sentenceEnded flags are reset to 0.
    my $blocked;
    for my $name (keys %tagStash) {
        next unless $tagStash{$name} > 0;
        print "<!-- .. still got an open $name -->";
        $blocked = "1";
    }
    unless ($blocked) {
        # print "<\/s>\n";
        print "\n";
        $sentenceEnded   = 0;
        $sentenceStarted = 0;
    }
}
# --- Residue from the web page this script was copied from (not Perl) ---
# afterTag
# Posted in
# 1 Comment
# [...] in a nice clean TEI conformant version, but somehow it’s always quicker to just run an after-the-event perl script to tidy up its output. Which gave me a bunch of files that contained lines like this <div [...]