Commit 7bd79bf5 authored by Indrek Jentson's avatar Indrek Jentson

Lisatud uuendatud perl'i skriptid

parent 2a462055
# DOCKERi skript KL-lausestaja ehitamiseks
FROM debian:buster
FROM debian:oldstable
# Kui Debianil pole vaikimisi kaasas GCC vajalikke teeke vms, siis tuleb need ka lasta installida.
......@@ -34,6 +34,9 @@ RUN apt-get -y install perl && \
COPY config_dist.js /wrapper/config.js
COPY command.sh /wrapper
COPY EstCGFix/json2mrf.pl /wrapper/EstCG/json2mrf.pl
COPY EstCGFix/rtolkija.pl /wrapper/EstCG/rtolkija.pl
COPY EstCGFix/rlausestaja.pl /wrapper/EstCG/rlausestaja.pl
# Tarkvara X paigaldamine lõppeb
......
#!/usr/bin/perl
# Converts pretty-printed, tab-indented JSON morphological analysis (read from
# the files named on the command line, or from STDIN) into a line-oriented
# text format written to STDOUT:
#
#   <s>                                   start of a sentence
#   WORD                                  one line per token
#    LEMMA+ENDINGCLITIC //_POS_ FORM //   one line per reading of that token
#   </s>                                  end of a sentence
#
# The input is NOT parsed as JSON. Every line is recognised purely by its
# exact number of leading tabs (4 = sentence object, 7 = token "text",
# 8 = reading object, 9 = reading fields), so the input must keep exactly
# that layout.
# NOTE(review): the disabled workaround below mentions vabamorf, so the input
# is presumably vabamorf's JSON output -- confirm.
#
# Readings are collected while they are read and flushed when the token's
# "text" line is seen, i.e. the "text" field is expected to come after the
# token's readings in the input.
use strict;
use warnings;
use utf8;
use open qw(:std :utf8);

my $s      = 0;     # 1 while inside a sentence object, 0 otherwise
my $word   = "";    # surface form of the current token
my $read   = "";    # the reading line built most recently
my $reads  = "";    # all reading lines collected for the current token
my $gi     = "";    # "clitic" field of the current reading
my $ending = "";    # "ending" field of the current reading
my $form   = "";    # "form" field of the current reading
my $pos    = "";    # "partofspeech" field of the current reading
my $lemma  = "";    # "root" field of the current reading

while (<>) {
    #s/[\\]n/\n/g; # workaround for a vabamorf bug (disabled)
    chomp;

    if (/^\t\t\t\t\{/) {    # depth 4 "{": a sentence starts
        $s = 1;
        print "<s>\n";
    }
    if (/^\t\t\t\t\}/) {    # depth 4 "}": a sentence ends
        $s = 0;
        print "</s>\n";
    }

    if ($s == 1) {
        if (/^\t\t\t\t\t\t\t"text":/) {    # depth 7, e.g.  "text": "Kes"
            s/^\t\t\t\t\t\t\t"text": "(.*)"$/$1/g;
            $word = $_;
            if ($reads eq "") {
                # No readings were collected for this token: emit it as
                # punctuation (_Z_). Escaped quotes and backslashes are
                # normalised to a plain double quote. (The first two
                # literals below denote the same two-character string.)
                if ($word eq '\"' or $word eq '\\"' or $word eq '\\' or $word eq '\\\\"') {
                    print "\"\n \"\"\" //_Z_ //\n";
                }
                else {
                    print $word."\n ".$word." //_Z_ //\n";
                }
            }
            else {
                print $word."\n".$reads;    # the word followed by its readings
            }
            $word  = "";
            $reads = "";
            next;
        }

        if (/^\t\t\t\t\t\t\t\t\{/) {    # depth 8 "{": a reading starts
            $gi     = "";
            $ending = "";
            $form   = "";
            $pos    = "";
            $lemma  = "";
            next;
        }

        if (/^\t\t\t\t\t\t\t\t\}/) {    # depth 8 "}": a reading ends
            if ($pos eq "Z") {
                # Punctuation readings carry no ending, clitic or form.
                $read = " ".$lemma." //_".$pos."_ //\n";
            }
            else {
                $read = " ".$lemma."+".$ending.$gi." //_".$pos."_ ".$form." //\n";
            }
            $reads = $reads.$read;
            next;
        }

        if (/^\t\t\t\t\t\t\t\t\t"clitic":/) {    # depth 9, e.g.  "clitic": "",
            s/^\t\t\t\t\t\t\t\t\t"clitic": "(.*)",/$1/g;
            $gi = $_;
            next;
        }

        if (/^\t\t\t\t\t\t\t\t\t"ending":/) {    # depth 9, e.g.  "ending": "",
            s/^\t\t\t\t\t\t\t\t\t"ending": "(.*)",/$1/g;
            $ending = $_;
            next;
        }

        if (/^\t\t\t\t\t\t\t\t\t"form":/) {    # depth 9, e.g.  "form": "",
            s/^\t\t\t\t\t\t\t\t\t"form": "(.*)",/$1/g;
            $form = $_;
            next;
        }

        if (/^\t\t\t\t\t\t\t\t\t\"partofspeech\":/) {    # depth 9, e.g.  "partofspeech": "",
            s/^\t\t\t\t\t\t\t\t\t\"partofspeech\": "(.*)",/$1/g;
            $pos = $_;
            next;
        }

        if (/^\t\t\t\t\t\t\t\t\t"root":/) {    # depth 9, e.g.  "root": ""
            # Unlike the fields above, no trailing comma is expected here.
            s/^\t\t\t\t\t\t\t\t\t"root": "(.*)"/$1/g;
            $lemma = $_;
        }
    }
}
#!/usr/bin/perl -w
# arvestatakse, et reavahetus on lõigupiir ja ka lausepiir
use utf8;
use open qw(:std :utf8);
use locale;
# Sentence splitter and tokenizer: reads text line by line, treats every input
# line as a paragraph (and therefore also sentence) boundary, inserts <s> and
# </s> markers at detected sentence boundaries, separates punctuation from
# words and prints the result with one token per line.
#
# The substitutions below are strictly order-dependent; later rules rely on
# the markers and spacing produced by earlier ones.
#
# Shorthand used in the trailing comments (as can be read off the patterns):
#   v = lowercase letter [a-zõäöü],  S = uppercase letter [A-ZÕÄÖÜ],
#   N = digit [0-9],  " = a quotation mark (\273 is », \253 is «),
#   \n = line start / line end.
my $rida = "";
while (<>){
chomp;
# Replace typographic quotes and guillemets typed directly in the input with
# a plain ASCII double quote.
s:\”:":g;
s:\“:":g;
s:«:":g;
s:»:":g;
$rida = $_;
# Decode HTML entities; note that this may re-introduce » and « (from
# &raquo;/&laquo;/&rdquo;/&ldquo;), which the quote-aware rules below match
# as \273 and \253.
$rida=convert_umlauts($rida);
# Close the sentence at the end of the line and open one at the start of the
# line (unless the line already starts with "<").
$rida =~ s:([^>]+)$:$1 </s> :g; #v\n
$rida =~ s:^([^<]):<s>$1:g; #\nv
# Sentence boundaries after a full stop followed by a capital letter.
$rida =~ s:([a-zõäöü]\.) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g; #v. S
$rida =~ s:(\)\.) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g; #). S
$rida =~ s:(\273\.) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g; #". S
$rida =~ s:(\.\273) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g; #.". S
$rida =~ s:([a-zõäöü]\.)[ ]*(\253[A-ZÕÄÖÜ]):$1</s> <s>$2:g; #v. "S
# NOTE(review): every “ typed in the input was replaced by " above, so this
# rule can only fire if “ is re-introduced some other way -- confirm.
$rida =~ s:([a-zõäöü]\.) ("\“[A-ZÕÄÖÜ]):$1</s> <s>$2:g; #v. "S
$rida =~ s:([a-zõäöü]\.) (\253\253[A-ZÕÄÖÜ]):$1</s> <s>$2:g; #v. ""S
# Full stop followed by a digit starts a new sentence, except that the whole
# line is left alone if it contains "nr." anywhere.
if ($rida =~ m:[a-zõäöü]\.\s+[0-9]:){ #v. N
if ($rida !~ m:nr\.:){
$rida =~ s:([a-zõäöü]\.) ([0-9]):$1</s> <s>$2:g;
}
}
$rida =~ s:([0-9]%\.) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g; #N%. S
$rida =~ s:([0-9]\.) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g; #N. S
$rida =~ s:([a-zõäöü][!\?]\273) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g; #v!" S
# The exclamation or question mark is not inside a quotation: the line
# contains no opening quote («) at all.
if ($rida !~ m:\253:){
$rida =~ s:([a-zõäöü][!\?]) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g; #v! S
$rida =~ s:([a-zõäöü][!\?]) ([0-9]):$1</s> <s>$2:g; #v! N
$rida =~ s:([a-zõäöü]\?!) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g; #v?! S
}
else {
# The line contains an opening quote: split after a complete quotation that
# follows ": ", then repeatedly split on !/? only in the text between <s> and
# the first opening quote.
$rida =~ s:(\: \253[^\273]*[\.!\?]\273) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g;
while($rida =~ s:(<s>[^\253]*?[a-zõäöü][!\?]) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g){}
}
# A sentence that starts with an opening quote and was split before the quote
# was closed: close the quote before </s> and reopen it after <s>.
$rida =~ s:(<s>\253[^\<\273]*?)(</s> <s>):$1\273$2\253:g; #<">< => <""><"
#$rida = convert_back($rida);
# Normalize quotation marks for the morphological analyzer: put a space
# between a quote, bracket or tag character and the adjacent text.
$rida =~ s:([^ ])\" :$1 ":g;
$rida =~ s:\"([^ ]):" $1:g;
$rida =~ s:([^ ])»:$1 ”:g;
$rida =~ s:«([^ ]):“ $1:g;
$rida =~ s:([^ ])”:$1 ”:g;
$rida =~ s:\“([^ ]):“ $1:g;
$rida =~ s:([^ ])\):$1 ):g;
$rida =~ s:\(([^ ]):( $1:g;
$rida =~ s:>([^ ]):> $1:g;
$rida =~ s:([^ ])<:$1 <:g;
# Detach a run of three, two or one punctuation characters from the word in
# front of it when the run is followed by a space.
$rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”][\.?!][\.!?]) :$1 $2 :g; # 3.. . teeb
$rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”][\.!?]) :$1 $2 :g;
$rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”]) :$1 $2 :g;
# Numbers: join groups of digits that are separated by a single space.
# $rida =~ s:([^\d])\. :$1 . :g;
$rida =~ s:(\d+) (\d+) (\d+):$1$2$3:g;
$rida =~ s:(\d+) (\d+):$1$2:g;
# $rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”])$:$1 $2 :g;
# $rida =~ s:(.*)$:$1 !!! :g;
# Detach trailing punctuation that sits directly before the final </s>.
$rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”][\.!?][\.!?].) </s> $:$1 $2 </s> :g; #v\n
$rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”][\.!?].) </s> $:$1 $2 </s> :g; #v\n
$rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”].) </s> $:$1 $2 </s> :g; #v\n
$rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”]) </s> $:$1 $2 </s> :g; #v\n
# Ordinal numbers: re-attach a detached full stop to the preceding number
# when text other than a tag follows.
$rida =~ s:(\d+)\s+[.](\s+[^<]*):$1.$2:g; # ordinal numbers
# "N. a ." -> "N. a."  (presumably the abbreviation of "aasta", year -- confirm)
$rida =~ s:(\d+)[.]\s+a\s+[.]:$1. a.:g;
# Special case for court decisions: undo a sentence split between " p." and a
# following digit.
# NOTE(review): the "." after "p\s*" is unescaped and matches any character.
$rida =~ s: p\s*. </s>\s+<s>\s+(\d): p. $1:g;
# Insert a space after a closing parenthesis that is glued to following text.
$rida =~ s:[)](\S+):) $1:g;
# A free-standing "s" is wrapped in single quotes.
$rida =~ s: s : 's' :g;
# NOTE(review): pattern and replacement look identical here (space -> space);
# the pattern may originally have been a double or non-breaking space -- confirm.
$rida =~ s: : :g;
# Collapse whitespace runs, tidy the end of the line and skip lines that are
# empty or whitespace-only at this point.
$rida =~ s:\s+: :g;
$rida =~ s:</s>\s+$:</s> :g;
if ($rida =~ m/^\s*$/) {next;}
# Every remaining space becomes a newline, giving one token per output line.
$rida =~ s: :\n:g;
print $rida;
}
# Decode a fixed set of HTML character entities in a string.
#
# Argument: the text to convert.
# Returns:  a copy of the text in which the entities for the Estonian
#           umlaut letters (both cases) are replaced by the letters
#           themselves, &raquo; and &rdquo; by chr(0273) (») and &laquo;
#           and &ldquo; by chr(0253) («).
sub convert_umlauts{
    my ($text) = @_;

    # (entity, replacement) pairs, applied one after another in this order.
    my @replacements = (
        [ '&auml;',   'ä' ],
        [ '&ouml;',   'ö' ],
        [ '&uuml;',   'ü' ],
        [ '&otilde;', 'õ' ],
        [ '&Auml;',   'Ä' ],
        [ '&Ouml;',   'Ö' ],
        [ '&Uuml;',   'Ü' ],
        [ '&Otilde;', 'Õ' ],
        [ '&raquo;',  "\273" ],
        [ '&laquo;',  "\253" ],
        [ '&rdquo;',  "\273" ],
        [ '&ldquo;',  "\253" ],
    );

    for my $pair (@replacements) {
        my ($entity, $char) = @{$pair};
        $text =~ s/\Q$entity\E/$char/g;
    }
    return $text;
}
# Encode the Estonian umlaut letters and the guillemets as HTML entities.
#
# Argument: the text to convert.
# Returns:  a copy of the text in which ä ö ü õ Ä Ö Ü Õ are replaced by
#           their named entities, chr(0273) (») by &raquo; and chr(0253)
#           («) by &laquo;.
sub convert_back{
    my ($text) = @_;

    # (character, entity) pairs, applied one after another in this order.
    my @replacements = (
        [ 'ä',    '&auml;'   ],
        [ 'ö',    '&ouml;'   ],
        [ 'ü',    '&uuml;'   ],
        [ 'õ',    '&otilde;' ],
        [ 'Ä',    '&Auml;'   ],
        [ 'Ö',    '&Ouml;'   ],
        [ 'Ü',    '&Uuml;'   ],
        [ 'Õ',    '&Otilde;' ],
        [ "\273", '&raquo;'  ],
        [ "\253", '&laquo;'  ],
    );

    for my $pair (@replacements) {
        my ($char, $entity) = @{$pair};
        $text =~ s/\Q$char\E/$entity/g;
    }
    return $text;
}
#!/usr/bin/perl -w
use utf8;
use open qw(:std :utf8);
# Load the tag translation table into @tabel.
# Every line that is non-empty and does not start with "¤" is split on "@";
# the resulting field list is stored as one row (array reference) of @tabel.
# The script dies if the table file cannot be opened.
#
# Uses three-argument open with a lexical filehandle, so the mode can never
# be taken from the file name and no global bareword handle is created.
open(my $table_fh, '<', '/wrapper/EstCG/tmorftrtabel.txt')
    or die "rtolkija viga, näita tmorftrtabel rada: $!\n";
while (my $table_line = <$table_fh>) {
    if ($table_line =~ /^[^¤]+/) {
        chomp $table_line;
        my @fields = split(/@/, $table_line);
        push(@tabel, [@fields]);
    }
}
close($table_fh);
#for ($i=0;$i<$#tabel;$i++){
# print $tabel[$i][1],"\n";
#}
# Main loop: reads analysed text line by line. Lines that start with a
# non-space character are word-form lines and are echoed; all other lines are
# readings ("lemma //_POS_ morphological info //"), whose tags are rewritten:
# punctuation readings (_Z_) get a punctuation-type tag appended, and all
# other readings have "POS info" looked up in @tabel (column 1 -> column 3).
#
# $lipp counts the table rows that matched the current morphological item.
# $cap is 1 when the most recent word-form line started with an uppercase
# letter; " cap" is then appended to every non-punctuation tag that is printed.
$lipp=0;
my $cap=0;
while(<>){
chomp;
if (/^[^ ]+/) { # word-form line
# Un-escape a leading \" to a plain double quote, echo the word form and
# remember whether it is capitalised.
s/^\\"/"/g;
print;
print "\n";
if (/^[A-ZÕÄÖÜŽŠ]+/) {
$cap=1;
} else {
$cap=0;
}
next;
}
$tolgendus=$_;
# Punctuation readings: insert a tag naming the punctuation mark between
# "//_Z_ " and the closing "//". Once a tag has been inserted the reading no
# longer contains "//_Z_ //", so at most one of these rules applies, and
# longer marks ("...", "..", "--") must be tried before shorter ones.
# Mapping: … and ... -> Ell, .. -> Els, . -> Fst, "," -> Com, : -> Col,
# ; -> Scl, ? -> Int, ! -> Exc, -- -> Dsd, - -> Dsh, ( -> Opr, ) -> Cpr,
# \" -> Quo, « and “ -> Oqu, » and ” -> Cqu, [ -> Osq, ] -> Csq, / -> Sla.
$tolgendus=~s/…([\+0]*) \/\/_Z_ \/\//…$1 \/\/_Z_ Ell \/\//;
$tolgendus=~s/\.\.\.([\+0]*) \/\/_Z_ \/\//\.\.\.$1 \/\/_Z_ Ell \/\//;
$tolgendus=~s/\.\.([\+0]*) \/\/_Z_ \/\//\.\.$1 \/\/_Z_ Els \/\//;
$tolgendus=~s/\.([\+0]*) \/\/_Z_ \/\//\.$1 \/\/_Z_ Fst \/\//;
$tolgendus=~s/,([\+0]*) \/\/_Z_ \/\//,$1 \/\/_Z_ Com \/\//;
$tolgendus=~s/:([\+0]*) \/\/_Z_ \/\//:$1 \/\/_Z_ Col \/\//;
$tolgendus=~s/;([\+0]*) \/\/_Z_ \/\//;$1 \/\/_Z_ Scl \/\//;
$tolgendus=~s/\?([\+0]*) \/\/_Z_ \/\//\?$1 \/\/_Z_ Int \/\//;
$tolgendus=~s/\!([\+0]*) \/\/_Z_ \/\//\!$1 \/\/_Z_ Exc \/\//;
$tolgendus=~s/--([\+0]*) \/\/_Z_ \/\//--$1 \/\/_Z_ Dsd \/\//;
$tolgendus=~s/-([\+0]*) \/\/_Z_ \/\//-$1 \/\/_Z_ Dsh \/\//;
$tolgendus=~s/\(([\+0]*) \/\/_Z_ \/\//\($1 \/\/_Z_ Opr \/\//;
$tolgendus=~s/\)([\+0]*) \/\/_Z_ \/\//\)$1 \/\/_Z_ Cpr \/\//;
# An escaped quote (\") is replaced by a plain " and tagged Quo.
$tolgendus=~s:\\"\s+//_Z_ //:" //_Z_ Quo //:g;
$tolgendus=~s/«([\+0]*) \/\/_Z_ \/\//«$1 \/\/_Z_ Oqu \/\//;
$tolgendus=~s/»([\+0]*) \/\/_Z_ \/\//»$1 \/\/_Z_ Cqu \/\//;
$tolgendus=~s/“([\+0]*) \/\/_Z_ \/\//“$1 \/\/_Z_ Oqu \/\//; # U+201C (UTF-8 bytes E2 80 9C)
$tolgendus=~s/”([\+0]*) \/\/_Z_ \/\//”$1 \/\/_Z_ Cqu \/\//; # U+201D (UTF-8 bytes E2 80 9D)
# NOTE(review): "<" is tagged Grt and ">" is tagged Sml; if these abbreviate
# "greater"/"smaller" they look swapped -- confirm against the consumer of
# these tags before changing anything.
$tolgendus=~s/<([\+0]*) \/\/_Z_ \/\//<$1 \/\/_Z_ Grt \/\//;
$tolgendus=~s/>([\+0]*) \/\/_Z_ \/\//>$1 \/\/_Z_ Sml \/\//;
$tolgendus=~s/\[([\+0]*) \/\/_Z_ \/\//\[$1 \/\/_Z_ Osq \/\//;
$tolgendus=~s/\]([\+0]*) \/\/_Z_ \/\//\]$1 \/\/_Z_ Csq \/\//;
$tolgendus=~s/\/([\+0]*) \/\/_Z_ \/\//\/$1 \/\/_Z_ Sla \/\//;
# "=" and "+": the trailing run of "+"/"0" characters is dropped and no tag
# is added.
$tolgendus=~s/\=([\+0]*) \/\/_Z_ \/\//\= \/\/_Z_ \/\//;
$tolgendus=~s/\+([\+0]*) \/\/_Z_ \/\//\+ \/\/_Z_ \/\//;
# "&" and "%" with any one-character part-of-speech tag are re-tagged as _Y_
# and everything after the tag is discarded.
$tolgendus=~s/\&[\+0]* \/\/_\S_ .*$/& \/\/_Y_ \/\//;
$tolgendus=~s/\%[\+0]* \/\/_\S_ .*/% \/\/_Y_ \/\//;
# Punctuation readings are finished at this point.
if ($tolgendus =~ /_Z_/) { print $tolgendus,"\n"; next; }
#if ($tolgendus =~ /(.*)\s+\/\/(_._) (.*)\/\/(.*)/){
# Other readings: $1 = lemma part, $2 = part-of-speech tag of the form _X_,
# $3 = comma-separated list of morphological descriptions.
if ($tolgendus =~ /(.*)\s+\/\/(_._) (.*)\/\//){
$root=$1 ;
$pos=$2 ;
@inf=split(/,/,$3);
#$eki=$4 ;
#print $1 ,"X" ,$2 ,"X", $3 ,"X" ,$4 ,"X";
#print ">",@inf,"<";
foreach $m (@inf){
#print "\n=",$m;
# Normalise whitespace inside the item and skip empty items.
$m =~ s/\s+/ /g;
$m =~ s/^\s+//g;
next if ($m=~/^\s*$/);
# Lookup key: part-of-speech tag, a space, then the item, with trailing
# whitespace removed.
$morf=$pos." ".$m;
$morf=~s/(.*)\s+$/$1/g;
# $j is incremented for every table row but is not read anywhere in this loop.
$j=0;$lipp=0;
foreach $rida (@tabel){
if ($morf eq $rida->[1]) {
# Keep the untranslated key so that further table rows can match as well;
# every matching row produces its own output line.
$m2=$morf;
# Column 1 is interpolated into the pattern as a regular expression, not as a
# literal string, so a key ending in "?" is not matched by the first
# substitution; the second substitution handles that case.
# NOTE(review): \Q...\E around $rida->[1] would make these matches literal.
$morf=~ s/$rida->[1]$/$rida->[3]/;
$morf=~ s/$rida->[1] \?$/$rida->[3]/;
# was: $morf=~ s/$rida->[1]$/$rida->[3]/;
# which did not convert rows ending in "?", e.g. "_N_ ?"
# A " ?" remaining in the tag is rewritten to " #?".
$morf=~s/ \?/ \#?/;
if ($cap) {
$morf .= " cap";
}
print $root." //".$morf." //\n";
$morf=$m2; #last;
$lipp++;
}
$j++;
}
# No table row matched: print the reading with the untranslated tag.
if ($lipp==0) {
if ($cap) {
$morf .= " cap";
}
print $root." //".$morf." //\n"; $lipp=0;
}
}
# The reading has no morphological information, only a part-of-speech tag:
# print one line for every table row whose column 1, used as a regular
# expression, matches the tag.
if ($3=~/^\s*$/) {
$morf=$pos;
foreach $rida (@tabel){
if ($morf =~ /$rida->[1]/){
$m2=$morf;
# was: $morf=~ s/$rida->[1]$/$rida->[3]/;
$morf=~ s/$rida->[1]/$rida->[3]/;
$morf=~s/ \?/ \#?/;
#print "$root"; print "$morf"; print "2";
if ($cap) {
$morf .= " cap";
}
print $root." //".$morf." //\n";
$morf=$m2; }
}
}
# $tolgendus=" ".$root." //".$morf." //";
}
# Lines that do not look like a reading are passed through unchanged.
else { print $tolgendus,"\n"; }
#{print $tolgendus,"\n"; }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment