Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Krista Liin
TÜveakorpus
Commits
b5ea55d3
Commit
b5ea55d3
authored
Apr 01, 2019
by
rabauti
Browse files
korpuse laused TSV failis: ELEMENDI_ID ELEMENDI_TEXT
parent
e80584d9
Changes
2
Expand all
Hide whitespace changes
Inline
Side-by-side
korpus_tsv/korpus.tsv
0 → 100644
View file @
b5ea55d3
This diff is collapsed.
Click to expand it.
makeTSV.py
0 → 100644
View file @
b5ea55d3
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Usage: python makeTSV.py | sort -d > korpus_tsv/korpus.tsv
import os
import pathlib
import re
import sys
from pathlib import Path

from bs4 import BeautifulSoup
# Directory holding the source files (one sub-folder per group of documents).
indir = '/Users/rabauti/repos/tu/ut_veakorpus/korpus_ids'

# Matches "<name>.txt" file names. The dot is escaped so that only a real
# ".txt" extension qualifies (an unescaped dot would also accept "footxt").
file_pattern_txt = re.compile(r'([^/]+)\.txt$')

# Child elements of each <eksimus> that are exported, in output order.
ITEM_NAMES = ('algne', 'parandus', 'kommentaar')


def format_row(item_id, text):
    """Return one TSV line ``ID<TAB>TEXT`` (without a trailing newline).

    Both values are stripped of surrounding whitespace. Newlines and tabs
    inside the text are replaced by a single space each, so the text can
    neither break the row nor add a column.
    """
    flat_text = text.strip().replace('\n', ' ').replace('\t', ' ')
    return '%s\t%s' % (item_id.strip(), flat_text)


def iter_source_files(root):
    """Yield the paths of all ``*.txt`` files one level below *root*.

    Entries of *root* whose name starts with a dot, and entries that are not
    directories, are skipped. Names are visited in sorted order so repeated
    runs produce the same sequence.
    """
    for folder in sorted(os.listdir(root)):
        folder_path = os.path.join(root, folder)
        # A regular file directly under root cannot be listed; skip it.
        if folder.startswith('.') or not os.path.isdir(folder_path):
            continue
        for name in sorted(os.listdir(folder_path)):
            if file_pattern_txt.search(name):
                yield os.path.join(folder_path, name)


def iter_rows(dokument):
    """Yield a TSV row for every exported element inside *dokument*.

    For each ``eksimus`` element, its ``algne``, ``parandus`` and
    ``kommentaar`` descendants are emitted as ``id<TAB>text``. Comments whose
    text is empty or whitespace-only are left out.

    Raises:
        KeyError: if an exported element has no ``id`` attribute.
    """
    for eksimus in dokument.find_all('eksimus'):
        for item_name in ITEM_NAMES:
            for item in eksimus.find_all(item_name):
                text = item.string
                if text is None:
                    # .string is None for elements with zero or several
                    # children; fall back to the concatenated text.
                    text = item.get_text()
                # Empty comments are left out.
                if item_name == 'kommentaar' and not text.strip():
                    continue
                yield format_row(item['id'], text)


def main(source_dir=indir):
    """Print every corpus sentence found under *source_dir* as TSV rows.

    Each matching file is parsed as XML. Files without a ``dokument``
    element are reported on stderr and skipped, so stdout stays clean for
    piping into ``sort``.

    NOTE(review): files are read as UTF-8 — confirm this matches the corpus.
    """
    for filename in iter_source_files(source_dir):
        with open(filename, 'r', encoding='utf-8') as handle:
            soup = BeautifulSoup(handle.read(), "xml")
        dokument = soup.find('dokument')
        if dokument is None:
            sys.stderr.write('%s: no <dokument> element, skipped\n' % filename)
            continue
        for row in iter_rows(dokument):
            print(row)


if __name__ == '__main__':
    main()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment