Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
K
kl-lausestaja
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Package Registry
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
keeleliin
kl-lausestaja
Commits
7bd79bf5
Commit
7bd79bf5
authored
Apr 14, 2019
by
Indrek Jentson
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Lisatud uuendatud perl'i skriptid
parent
2a462055
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
355 additions
and
1 deletion
+355
-1
Dockerfile
Dockerfile
+4
-1
EstCGFix/json2mrf.pl
EstCGFix/json2mrf.pl
+88
-0
EstCGFix/rlausestaja.pl
EstCGFix/rlausestaja.pl
+129
-0
EstCGFix/rtolkija.pl
EstCGFix/rtolkija.pl
+134
-0
No files found.
Dockerfile
View file @
7bd79bf5
# DOCKERi skript KL-lausestaja ehitamiseks
FROM
debian:
buster
FROM
debian:
oldstable
# Kui Debianil pole vaikimisi kaasas GCC vajalikke teeke vms, siis tuleb need ka lasta installida.
...
...
@@ -34,6 +34,9 @@ RUN apt-get -y install perl && \
COPY
config_dist.js /wrapper/config.js
COPY
command.sh /wrapper
COPY
EstCGFix/json2mrf.pl /wrapper/EstCG/json2mrf.pl
COPY
EstCGFix/rtolkija.pl /wrapper/EstCG/rtolkija.pl
COPY
EstCGFix/rlausestaja.pl /wrapper/EstCG/rlausestaja.pl
# Tarkvara X paigaldamine lõppeb
...
...
EstCGFix/json2mrf.pl
0 → 100755
View file @
7bd79bf5
#!/usr/bin/perl
use
utf8
;
use
open
qw(:std :utf8)
;
# json2mrf.pl: line-oriented converter that reads tab-indented JSON from
# <> and prints one block per token: the word form followed by its reading
# lines, with sentences wrapped in <s> ... </s>.
# The nesting level is recognised purely by the number of leading tabs, so
# the input must be pretty-printed exactly as expected
# (presumably vabamorf JSON output -- TODO confirm against the producer).
#
# State carried across input lines:
#   $s       1 while inside a sentence object, 0 otherwise
#   $word    surface form taken from the current "text" line
#   $read    reading line built from the analysis object that just closed
#   $reads   all reading lines collected since the last "text" line
#   $ending, $form, $pos, $lemma   fields of the analysis being read
# NOTE(review): $gi (the clitic) is used below but never declared with
# "my", so it is a package global; this only works because the script does
# not enable "use strict".
my
$s
=
0
;
my
$word
=
"";
my
$read
=
"";
my
$reads
=
"";
my
$ending
=
"";
my
$form
=
"";
my
$pos
=
"";
my
$lemma
=
"";
# Main loop: classify every input line by its leading tabs and key name.
while
(
<>
){
#s/[\\]n/\n/g; # vabamorf bug (workaround, disabled)
chomp
;
# Four tabs followed by "{" open a sentence: raise the flag, print <s>.
if
(
/^\t\t\t\t\{/
){
# sentence
$s
=
1
;
print
"
<s>
\n
";
}
# Four tabs followed by "}" close the sentence: drop the flag, print </s>.
if
(
/^\t\t\t\t\}/
){
# sentence
$s
=
0
;
print
"
</s>
\n
";
}
# Everything below is only considered while inside a sentence.
if
(
$s
==
1
)
{
# Seven tabs + "text": the surface form. The block prints the word together
# with the readings accumulated so far, then resets both.
if
(
/^\t\t\t\t\t\t\t"text":/
)
{
# 7 "text": "Kes"
# Reduce $_ to the quoted value; if the line does not match this exact
# shape the substitution fails and $_ (hence $word) keeps the whole line.
s/^\t\t\t\t\t\t\t"text": "(.*)"$/$1/g
;
$word
=
$_
;
# No readings were collected: emit a fallback reading tagged _Z_.
if
(
$reads
eq
"")
{
# Words that are just an escaped quote or backslash get a fixed
# quote-mark entry instead of being echoed.
if
(
$word
eq
'
\"
'
or
$word
eq
'
\\
"
'
or
$word
eq
'
\\
'
or
$word
eq
'
\\\\
"
')
{
print
"
\"\n
\"\"\"
//_Z_ //
\n
";
}
else
{
# Any other word is echoed as its own _Z_ reading.
print
$word
.
"
\n
"
.
$word
.
"
//_Z_ //
\n
";
}
}
else
{
print
$word
.
"
\n
"
.
$reads
;
# word and readings
}
# Reset per-word state before the next token.
$word
=
"";
$reads
=
"";
next
;
}
# Eight tabs + "{": a new analysis object starts, clear its fields.
if
(
/^\t\t\t\t\t\t\t\t\{/
)
{
# 8 { reading starts
$gi
=
"";
$ending
=
"";
$form
=
"";
$pos
=
"";
$lemma
=
"";
next
;
}
# Eight tabs + "}": the analysis object ends, build its reading line and
# append it to $reads.
if
(
/^\t\t\t\t\t\t\t\t\}/
)
{
# 8 } reading ends
# Part of speech "Z" gets the short form: lemma and tag only.
if
(
$pos
eq
"
Z
")
{
$read
=
"
"
.
$lemma
.
"
//_
"
.
$pos
.
"
_ //
\n
";
}
else
{
# Other parts of speech: lemma "+" ending and clitic, then tag and form.
$read
=
"
"
.
$lemma
.
"
+
"
.
$ending
.
$gi
.
"
//_
"
.
$pos
.
"
_
"
.
$form
.
"
//
\n
";
}
$reads
=
$reads
.
$read
;
next
;
}
# Nine tabs: one field of the current analysis. Each branch strips the
# line down to the quoted value and stores it.
if
(
/^\t\t\t\t\t\t\t\t\t"clitic":/
)
{
# 9 "clitic": "",
s/^\t\t\t\t\t\t\t\t\t"clitic": "(.*)",/$1/g
;
$gi
=
$_
;
next
;
}
if
(
/^\t\t\t\t\t\t\t\t\t"ending":/
)
{
# 9 "ending": "",
s/^\t\t\t\t\t\t\t\t\t"ending": "(.*)",/$1/g
;
$ending
=
$_
;
next
;
}
if
(
/^\t\t\t\t\t\t\t\t\t"form":/
)
{
# 9 "form": "",
s/^\t\t\t\t\t\t\t\t\t"form": "(.*)",/$1/g
;
$form
=
$_
;
next
;
}
if
(
/^\t\t\t\t\t\t\t\t\t\"partofspeech\":/
)
{
# 9 "partofspeech": "",
s/^\t\t\t\t\t\t\t\t\t\"partofspeech\": "(.*)",/$1/g
;
$pos
=
$_
;
next
;
}
# "root" is matched without a trailing comma, unlike the fields above.
if
(
/^\t\t\t\t\t\t\t\t\t"root":/
)
{
# 9 "root": "",
s/^\t\t\t\t\t\t\t\t\t"root": "(.*)"/$1/g
;
$lemma
=
$_
;
}
}
}
EstCGFix/rlausestaja.pl
0 → 100755
View file @
7bd79bf5
#!/usr/bin/perl -w
# arvestatakse, et reavahetus on lõigupiir ja ka lausepiir
use
utf8
;
use
open
qw(:std :utf8)
;
use
locale
;
# Sentence splitter / tokenizer main loop.
#
# Every input line is treated as one paragraph: it is wrapped in <s> ... </s>,
# additional sentence boundaries ("</s> <s>") are inserted after sentence-final
# punctuation, punctuation and brackets are split off from the neighbouring
# words, and finally every space is turned into a newline so that the output
# carries one token per line. Blank lines produce no output.
#
# Legend used in the rule comments below:
#   v = lowercase letter, S = uppercase letter, N = digit, " = quotation mark
#
# NOTE(review): all typographic quotes are folded to the ASCII quote before any
# other rule runs, so the rules that look for \253 / \273 or for the
# typographic quote characters can apparently never match any more; they are
# kept unchanged here -- confirm whether they may be removed.
my $rida = "";

while (<>) {
    chomp;

    # Fold typographic quotes and guillemets to the plain ASCII quote.
    s:\”:":g;
    s:\“:":g;
    s:«:":g;
    s:»:":g;

    $rida = $_;
    $rida = convert_umlauts($rida);

    # --- sentence boundaries -------------------------------------------
    $rida =~ s:([^>]+)$:$1 </s> :g;                                        # v\n   (close at end of line)
    $rida =~ s:^([^<]):<s>$1:g;                                            # \nv   (open at start of line)
    $rida =~ s:([a-zõäöü]\.) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g;                   # v. S
    $rida =~ s:(\)\.) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g;                          # ). S
    $rida =~ s:(\273\.) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g;                        # ". S
    $rida =~ s:(\.\273) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g;                        # .". S
    $rida =~ s:([a-zõäöü]\.)[ ]*(\253[A-ZÕÄÖÜ]):$1</s> <s>$2:g;            # v. "S
    $rida =~ s:([a-zõäöü]\.) ("\“[A-ZÕÄÖÜ]):$1</s> <s>$2:g;                # v. "S
    $rida =~ s:([a-zõäöü]\.) (\253\253[A-ZÕÄÖÜ]):$1</s> <s>$2:g;           # v. ""S

    # v. N -- but never split anywhere on a line that contains "nr."
    if ($rida =~ m:[a-zõäöü]\.\s+[0-9]:) {
        if ($rida !~ m:nr\.:) {
            $rida =~ s:([a-zõäöü]\.) ([0-9]):$1</s> <s>$2:g;
        }
    }

    $rida =~ s:([0-9]%\.) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g;                      # N%. S
    $rida =~ s:([0-9]\.) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g;                       # N. S
    $rida =~ s:([a-zõäöü][!\?]\273) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g;            # v!" S

    if ($rida !~ m:\253:) {
        # The exclamation / question mark is not inside a quotation.
        $rida =~ s:([a-zõäöü][!\?]) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g;            # v! S
        $rida =~ s:([a-zõäöü][!\?]) ([0-9]):$1</s> <s>$2:g;                # v! N
        $rida =~ s:([a-zõäöü]\?!) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g;              # v?! S
    }
    else {
        # The line contains an opening quote: split after a complete quoted
        # sentence, then repeatedly split the text before the first quote.
        $rida =~ s:(\: \253[^\273]*[\.!\?]\273) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g;
        while ($rida =~ s:(<s>[^\253]*?[a-zõäöü][!\?]) ([A-ZÕÄÖÜ]):$1</s> <s>$2:g) { }
    }

    # A quotation spanning a boundary is closed and reopened: <">< => <""><"
    $rida =~ s:(<s>\253[^\<\273]*?)(</s> <s>):$1\273$2\253:g;

    #$rida = convert_back($rida);

    # --- normalise quotation marks for the morphological analyser ------
    $rida =~ s:([^ ])\" :$1 ":g;
    $rida =~ s:\"([^ ]):" $1:g;
    $rida =~ s:([^ ])»:$1 ”:g;
    $rida =~ s:«([^ ]):“ $1:g;
    $rida =~ s:([^ ])”:$1 ”:g;
    $rida =~ s:\“([^ ]):“ $1:g;

    # --- detach brackets and sentence tags from neighbouring text ------
    $rida =~ s:([^ ])\):$1 ):g;
    $rida =~ s:\(([^ ]):( $1:g;
    $rida =~ s:>([^ ]):> $1:g;
    $rida =~ s:([^ ])<:$1 <:g;

    # --- detach punctuation (three, two, then one character) -----------
    # Original author's note on the first rule: "3.. ." is what it produces.
    $rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”][\.?!][\.!?]) :$1 $2 :g;
    $rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”][\.!?]) :$1 $2 :g;
    $rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”]) :$1 $2 :g;

    # --- numbers: join digit groups separated by single spaces ---------
    # $rida =~ s:([^\d])\. :$1 . :g;
    $rida =~ s:(\d+) (\d+) (\d+):$1$2$3:g;
    $rida =~ s:(\d+) (\d+):$1$2:g;

    # $rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”])$:$1 $2 :g;
    # $rida =~ s:(.*)$:$1 !!! :g;

    # --- detach punctuation directly before the final </s> -------------
    $rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”][\.!?][\.!?].) </s> $:$1 $2 </s> :g;    # v\n
    $rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”][\.!?].) </s> $:$1 $2 </s> :g;          # v\n
    $rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”].) </s> $:$1 $2 </s> :g;                # v\n
    $rida =~ s:([^ ])([\.,!?;\-\:\"\)\(«»“”]) </s> $:$1 $2 </s> :g;                 # v\n

    # --- ordinal numbers: re-attach the full stop to the number --------
    $rida =~ s:(\d+)\s+[.](\s+[^<]*):$1.$2:g;
    $rida =~ s:(\d+)[.]\s+a\s+[.]:$1. a.:g;

    # --- special case for court rulings: "p. <digit>" must stay together.
    # The full stop is matched literally; an unescaped "." here would also
    # rewrite "p ! </s> <s> 5" or "p ? </s> <s> 5" into "p. 5".
    $rida =~ s: p\s*\. </s>\s+<s>\s+(\d): p. $1:g;

    $rida =~ s:[)](\S+):) $1:g;
    $rida =~ s: s : 's' :g;

    # --- whitespace clean-up -------------------------------------------
    $rida =~ s: : :g;
    $rida =~ s:\s+: :g;
    $rida =~ s:</s>\s+$:</s> :g;

    # Nothing left on this line: emit nothing.
    if ($rida =~ m/^\s*$/) {
        next;
    }

    # One token per output line.
    $rida =~ s: :\n:g;
    print $rida;
}
# convert_umlauts($text)
# Returns a copy of the first argument after a fixed list of character
# substitutions; the argument itself is not modified.
# NOTE(review): as this source is rendered, the pattern and the replacement
# of the first eight substitutions look identical, which would make them
# no-ops -- confirm against the original bytes (they may differ in encoding
# or normalisation form).
sub
convert_umlauts
{
my
$l
=
$_
[
0
];
# Lowercase letters.
$l
=~
s/ä/ä/g
;
$l
=~
s/ö/ö/g
;
$l
=~
s/ü/ü/g
;
$l
=~
s/õ/õ/g
;
# Uppercase letters.
$l
=~
s/Ä/Ä/g
;
$l
=~
s/Ö/Ö/g
;
$l
=~
s/Ü/Ü/g
;
$l
=~
s/Õ/Õ/g
;
# Quotation marks: \273 and \253 are octal escapes for the code points
# 0xBB and 0xAB; closing marks are mapped to \273, opening marks to \253.
$l
=~
s/»/\273/g
;
$l
=~
s/«/\253/g
;
$l
=~
s/”/\273/g
;
$l
=~
s/“/\253/g
;
return
$l
;
}
# convert_back($text)
# Counterpart of convert_umlauts: returns a copy of the first argument after
# a fixed list of character substitutions, mapping \273 and \253 (octal
# escapes for code points 0xBB and 0xAB) back to the guillemet characters.
# NOTE(review): as rendered here, the eight letter substitutions have
# identical pattern and replacement -- confirm against the original bytes.
# NOTE(review): the only call visible in this script is commented out, so
# this sub appears to be unused.
sub
convert_back
{
my
$l
=
$_
[
0
];
# Lowercase letters.
$l
=~
s/ä/ä/g
;
$l
=~
s/ö/ö/g
;
$l
=~
s/ü/ü/g
;
$l
=~
s/õ/õ/g
;
# Uppercase letters.
$l
=~
s/Ä/Ä/g
;
$l
=~
s/Ö/Ö/g
;
$l
=~
s/Ü/Ü/g
;
$l
=~
s/Õ/Õ/g
;
# Quotation marks.
$l
=~
s/\273/»/g
;
$l
=~
s/\253/«/g
;
return
$l
;
}
EstCGFix/rtolkija.pl
0 → 100755
View file @
7bd79bf5
#!/usr/bin/perl -w
use
utf8
;
use
open
qw(:std :utf8)
;
# Load the tag translation table into the package array @tabel.
#
# Each kept line is split on "@" and stored as an array reference; the main
# loop compares column 1 with a reading's tag string and prints column 3 in
# its place. Lines that begin with the "¤" character are skipped.
#
# The file is opened with three-argument open and a lexical handle, so the
# path can never be interpreted as an open mode and no global bareword
# handle is left behind. @tabel is intentionally not declared with "my":
# the rest of the script reads it as a package variable.
my $table_path = '/wrapper/EstCG/tmorftrtabel.txt';

open(my $table_fh, '<', $table_path)
    or die "rtolkija viga, näita tmorftrtabel rada: $!\n";

while (my $table_line = <$table_fh>) {
    # Keep only lines whose first character is not the "¤" marker.
    next unless $table_line =~ /^[^¤]+/;
    chomp $table_line;
    push(@tabel, [ split(/@/, $table_line) ]);
}

close($table_fh);
#for ($i=0;$i<$#tabel;$i++){
# print $tabel[$i][1],"\n";
#}
# Main filter: rewrites the tag part of every reading line.
#
# Input lines that do not start with a space are word forms: they are echoed
# (with a leading \" turned into ") and remembered as capitalised or not.
# Every other line is a reading of the shape  <root> //_P_ <info> //  where
# P is a one-character part-of-speech tag:
#   * punctuation readings (_Z_) get a punctuation class appended and are
#     printed as they are;
#   * for other readings each comma-separated alternative of <info> is
#     combined with the tag and looked up in column 1 of @tabel; every
#     matching row prints one output reading built from column 3;
#   * alternatives without a matching row are printed unchanged;
#   * " cap" is appended to the tag string when the word form started with
#     an uppercase letter.
#
# Changes compared with the previous version:
#   * in the exact-match branch the table entry is used as a plain string;
#     it used to be interpolated into a regex, where a "?" in the entry acted
#     as a quantifier (the reason for the old second substitution);
#   * rows with a missing column no longer trigger undefined-value warnings
#     or, in the bare-tag branch, an empty pattern (which Perl treats as
#     "repeat the last successful pattern");
#   * the captured fields are copied to named variables immediately instead
#     of reading $3 again after other regex operations;
#   * the unused counter was removed.
my $cap = 0;

LINE:
while (<>) {
    chomp;

    # --- word-form line --------------------------------------------------
    if (/^[^ ]+/) {
        s/^\\"/"/g;
        print;
        print "\n";
        $cap = /^[A-ZÕÄÖÜŽŠ]+/ ? 1 : 0;
        next LINE;
    }

    my $tolgendus = $_;

    # --- punctuation classes ---------------------------------------------
    # Order matters: longer marks ("...", "--") are tried before shorter ones.
    # NOTE(review): "<" is tagged Grt and ">" is tagged Sml, which looks
    # swapped; left as is because downstream rules may rely on it.
    $tolgendus =~ s/…([\+0]*) \/\/_Z_ \/\//…$1 \/\/_Z_ Ell \/\//;
    $tolgendus =~ s/\.\.\.([\+0]*) \/\/_Z_ \/\//\.\.\.$1 \/\/_Z_ Ell \/\//;
    $tolgendus =~ s/\.\.([\+0]*) \/\/_Z_ \/\//\.\.$1 \/\/_Z_ Els \/\//;
    $tolgendus =~ s/\.([\+0]*) \/\/_Z_ \/\//\.$1 \/\/_Z_ Fst \/\//;
    $tolgendus =~ s/,([\+0]*) \/\/_Z_ \/\//,$1 \/\/_Z_ Com \/\//;
    $tolgendus =~ s/:([\+0]*) \/\/_Z_ \/\//:$1 \/\/_Z_ Col \/\//;
    $tolgendus =~ s/;([\+0]*) \/\/_Z_ \/\//;$1 \/\/_Z_ Scl \/\//;
    $tolgendus =~ s/\?([\+0]*) \/\/_Z_ \/\//\?$1 \/\/_Z_ Int \/\//;
    $tolgendus =~ s/\!([\+0]*) \/\/_Z_ \/\//\!$1 \/\/_Z_ Exc \/\//;
    $tolgendus =~ s/--([\+0]*) \/\/_Z_ \/\//--$1 \/\/_Z_ Dsd \/\//;
    $tolgendus =~ s/-([\+0]*) \/\/_Z_ \/\//-$1 \/\/_Z_ Dsh \/\//;
    $tolgendus =~ s/\(([\+0]*) \/\/_Z_ \/\//\($1 \/\/_Z_ Opr \/\//;
    $tolgendus =~ s/\)([\+0]*) \/\/_Z_ \/\//\)$1 \/\/_Z_ Cpr \/\//;
    $tolgendus =~ s:\\"\s+//_Z_ //:" //_Z_ Quo //:g;
    $tolgendus =~ s/«([\+0]*) \/\/_Z_ \/\//«$1 \/\/_Z_ Oqu \/\//;
    $tolgendus =~ s/»([\+0]*) \/\/_Z_ \/\//»$1 \/\/_Z_ Cqu \/\//;
    $tolgendus =~ s/“([\+0]*) \/\/_Z_ \/\//“$1 \/\/_Z_ Oqu \/\//;    # E2 80 9C
    $tolgendus =~ s/”([\+0]*) \/\/_Z_ \/\//”$1 \/\/_Z_ Cqu \/\//;    # E2 80 9D
    $tolgendus =~ s/<([\+0]*) \/\/_Z_ \/\//<$1 \/\/_Z_ Grt \/\//;
    $tolgendus =~ s/>([\+0]*) \/\/_Z_ \/\//>$1 \/\/_Z_ Sml \/\//;
    $tolgendus =~ s/\[([\+0]*) \/\/_Z_ \/\//\[$1 \/\/_Z_ Osq \/\//;
    $tolgendus =~ s/\]([\+0]*) \/\/_Z_ \/\//\]$1 \/\/_Z_ Csq \/\//;
    $tolgendus =~ s/\/([\+0]*) \/\/_Z_ \/\//\/$1 \/\/_Z_ Sla \/\//;
    $tolgendus =~ s/\=([\+0]*) \/\/_Z_ \/\//\= \/\/_Z_ \/\//;
    $tolgendus =~ s/\+([\+0]*) \/\/_Z_ \/\//\+ \/\/_Z_ \/\//;

    # "&" and "%" are re-tagged as _Y_ whatever tag they carried before.
    $tolgendus =~ s/\&[\+0]* \/\/_\S_ .*$/& \/\/_Y_ \/\//;
    $tolgendus =~ s/\%[\+0]* \/\/_\S_ .*/% \/\/_Y_ \/\//;

    # Punctuation readings need no table lookup.
    if ($tolgendus =~ /_Z_/) {
        print $tolgendus, "\n";
        next LINE;
    }

    # --- anything that is not a well-formed reading is passed through -----
    #if ($tolgendus =~ /(.*)\s+\/\/(_._) (.*)\/\/(.*)/){
    if ($tolgendus !~ /(.*)\s+\/\/(_._) (.*)\/\//) {
        print $tolgendus, "\n";
        next LINE;
    }

    # Copy the captures right away; later matches would overwrite $1..$3.
    my ($root, $pos, $info) = ($1, $2, $3);
    my @inf = split(/,/, $info);

    # --- one lookup per comma-separated alternative ----------------------
    foreach my $m (@inf) {
        $m =~ s/\s+/ /g;
        $m =~ s/^\s+//g;
        next if ($m =~ /^\s*$/);

        my $morf = $pos . " " . $m;
        $morf =~ s/(.*)\s+$/$1/g;

        my $matched = 0;
        foreach my $rida (@tabel) {
            next unless defined $rida->[1] && $morf eq $rida->[1];

            # The whole tag string equals the table key, so the translation
            # is simply column 3 (empty when the row has no such column).
            my $out = defined $rida->[3] ? $rida->[3] : '';
            $out =~ s/ \?/ \#?/;
            $out .= " cap" if $cap;
            print $root . " //" . $out . " //\n";
            $matched++;
        }

        # No table row for this alternative: print it untranslated.
        if ($matched == 0) {
            $morf .= " cap" if $cap;
            print $root . " //" . $morf . " //\n";
        }
    }

    # --- reading that carries only a part-of-speech tag -------------------
    if ($info =~ /^\s*$/) {
        foreach my $rida (@tabel) {
            # Skip rows without a usable key; an empty pattern would make
            # Perl silently reuse the last successful pattern.
            next unless defined $rida->[1] && length $rida->[1];

            # NOTE(review): the key is still applied as a regex here, as
            # before; keys such as "_N_ ?" match a bare "_N_" only because
            # "?" acts as a quantifier. Confirm whether that is intended
            # before switching this to a literal comparison.
            next unless $pos =~ /$rida->[1]/;

            my $to  = defined $rida->[3] ? $rida->[3] : '';
            my $out = $pos;
            $out =~ s/$rida->[1]/$to/;
            $out =~ s/ \?/ \#?/;
            $out .= " cap" if $cap;
            print $root . " //" . $out . " //\n";
        }
    }
    # $tolgendus=" ".$root." //".$morf." //";
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment