Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Krista Liin
TÜveakorpus
Commits
b6d93676
Commit
b6d93676
authored
Feb 27, 2019
by
rabauti
Browse files
korpus_xml failide elementidele id lisamine
parent
66d97a6c
Changes
1
Hide whitespace changes
Inline
Side-by-side
addIds.py
0 → 100644
View file @
b6d93676
#!/usr/bin/python
# -*- coding: utf-8 -*-
import
os
import
sys
import
pathlib
import
re
from
bs4
import
BeautifulSoup
from
pathlib
import
Path
# Directory holding the source files; the output tree keeps the same
# folder structure.
indir = '/Users/rabauti/repos/tu/ut_veakorpus/korpus_xml'
outdir = '/Users/rabauti/repos/tu/ut_veakorpus/korpus_ids'

# Selects the files to process by their ".txt" suffix (the original note gives
# RP_algtekst_2004/A7.xml as an example of a folder/file path).
# Raw string with an escaped dot, so only a literal ".txt" ending matches.
file_pattern_txt = re.compile(r'([^/]+)\.txt$')

# (element name, id letter) for the elements numbered inside each <eksimus>.
CHILD_ID_PREFIXES = (('algne', 'a'), ('parandus', 'p'), ('kommentaar', 'k'))


def clean_name(name):
    """Return *name* with the 'Veakorpus', ', uus' and ' uus' markers removed."""
    return name.replace('Veakorpus', '').replace(', uus', '').replace(' uus', '')


def make_doc_id(folder, name):
    """Build the document id '<corrector><year>_<student>'.

    corrector and year are the first and third '_'-separated parts of
    *folder*; student is the part of *name* before its first '.'.
    For example ('RP_algtekst_2004', 'A7.txt') gives 'RP2004_A7'.

    Raises:
        ValueError: if *folder* has fewer than three '_'-separated parts.
    """
    folder_parts = folder.split('_')
    if len(folder_parts) < 3:
        raise ValueError(
            "folder name %r does not have three '_'-separated parts" % folder)
    corrector = folder_parts[0]
    year = folder_parts[2]
    student = name.split('.')[0]
    return corrector + year + '_' + student


def add_ids(soup, doc_id):
    """Set 'id' attributes on <dokument> and the elements below it, in place.

    <dokument> gets *doc_id*; the n-th <eksimus> gets '<doc_id>_e<n>'; the
    m-th <algne>/<parandus>/<kommentaar> found under that <eksimus> gets
    '<doc_id>_e<n>_a<m>' / '..._p<m>' / '..._k<m>'. Numbering starts at 1.

    Raises:
        ValueError: if the parsed document has no <dokument> element.
    """
    dokument = soup.find('dokument')
    if dokument is None:
        raise ValueError('no <dokument> element found for %s' % doc_id)
    dokument['id'] = doc_id
    for i, eksimus in enumerate(dokument.find_all('eksimus'), start=1):
        eksimus_id = '%s_e%d' % (doc_id, i)
        eksimus['id'] = eksimus_id
        for tag_name, letter in CHILD_ID_PREFIXES:
            for j, child in enumerate(eksimus.find_all(tag_name), start=1):
                child['id'] = '%s_%s%d' % (eksimus_id, letter, j)


def process_file(src_file, dst_dir, folder):
    """Parse *src_file* as XML, add ids, and write it into *dst_dir*.

    The output file keeps the cleaned input name (see clean_name); *folder*
    is the name of the containing folder, used to derive the document id.
    """
    out_name = clean_name(src_file.name)
    doc_id = make_doc_id(folder, out_name)

    # read_text closes the file again; the encoding is explicit so the
    # result does not depend on the platform's default locale.
    soup = BeautifulSoup(src_file.read_text(encoding='utf-8'), "xml")
    add_ids(soup, doc_id)

    dst_dir.mkdir(parents=True, exist_ok=True)
    # Start every <eksimus> element on its own line in the output.
    out_xml = str(soup).replace('<eksimus', '\n<eksimus')
    (dst_dir / out_name).write_text(out_xml, encoding='utf-8')


def process_tree(src_root, dst_root):
    """Process every matching file one folder level below *src_root*.

    Entries of *src_root* that are not directories (e.g. a stray
    '.DS_Store') are skipped instead of aborting the run. Returns the number
    of files written.
    """
    src_root = Path(src_root)
    dst_root = Path(dst_root)
    written = 0
    # Sorted so the processing order is the same on every run.
    # NOTE(review): two inputs whose cleaned names coincide still overwrite
    # each other; with sorted order the later name wins deterministically.
    for folder_path in sorted(src_root.iterdir()):
        if not folder_path.is_dir():
            continue
        folder = folder_path.name.strip()
        for src_file in sorted(folder_path.iterdir()):
            if not src_file.is_file():
                continue
            if not file_pattern_txt.search(src_file.name):
                continue
            process_file(src_file, dst_root / folder, folder)
            written += 1
    return written


def main(argv=None):
    """Run the conversion; usage: addIds.py [indir [outdir]].

    Without arguments the module-level ``indir`` / ``outdir`` defaults are
    used.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) > 2:
        raise SystemExit('usage: addIds.py [indir [outdir]]')
    src_root = args[0] if len(args) > 0 else indir
    dst_root = args[1] if len(args) > 1 else outdir
    process_tree(src_root, dst_root)


if __name__ == '__main__':
    main()
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment