Commit ae1f1751 authored by rabauti's avatar rabauti
Browse files

korpus jagamiseks sobivale kujule

parent 7d6bb2f7
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Usage: python joinXml.py > korpus_public/oppijakeel.xml
import os
import sys
import re
from bs4 import BeautifulSoup
# Directory holding the source files.  Its non-hidden sub-directories are
# scanned (one level deep) for ``*.txt`` files, which are parsed as XML.
# A second checkout location that has also been used:
#   /home/osboxes/repos/tu/ut_veakorpus/korpus_ids
indir = '/Users/rabauti/repos/tu/ut_veakorpus/korpus_ids'
# File whose content is copied verbatim right after the opening <korpus> tag.
header = '/Users/rabauti/repos/tu/ut_veakorpus/korpus_public/header.xml'
# File names ending in a literal ".txt".  The dot is escaped so that names
# such as "foo_txt" are not picked up by accident.
file_pattern_txt = re.compile(r'([^/]+)\.txt$')

# Document-level elements copied onto every <eksimus> element as attributes,
# in this order.
_COPIED_FIELDS = ('emakeel', 'tyyp', 'tase')


def _tag_text(soup, name):
    """Return the stripped text of the first ``<name>`` element, or ''.

    An empty string is returned when the element is missing, or when it has
    no single text child (``.string`` is ``None`` in that case).
    """
    tag = soup.find(name)
    if tag is None or tag.string is None:
        return ''
    return tag.string.strip()


def _iter_text_files(root):
    """Yield the paths of ``*.txt`` files one level below *root*, sorted.

    Hidden entries (names starting with '.') and entries that are not
    directories are skipped.
    """
    for folder in sorted(os.listdir(root)):
        folder_path = os.path.join(root, folder)
        if folder.startswith('.') or not os.path.isdir(folder_path):
            continue
        for name in sorted(os.listdir(folder_path)):
            if file_pattern_txt.search(name):
                yield os.path.join(folder_path, name)


def _print_errors(path):
    """Print every ``<eksimus>`` element of the document at *path*.

    Each element is first annotated with the document's ``emakeel``, ``tyyp``
    and ``tase`` values as attributes.  A document without a ``<tekst>``
    element is reported on stderr and skipped.
    """
    # Hand the parser raw bytes so it can honour the encoding declared in
    # the document instead of depending on the locale's default encoding.
    with open(path, 'rb') as source:
        soup = BeautifulSoup(source, 'xml')

    tekst = soup.find('tekst')
    if tekst is None:
        sys.stderr.write('skipping %s: no <tekst> element\n' % path)
        return

    # The values are looked up in the whole document, as before.
    attributes = [(name, _tag_text(soup, name)) for name in _COPIED_FIELDS]
    for eksimus in tekst.find_all('eksimus'):
        for name, value in attributes:
            eksimus[name] = value
        print(eksimus.prettify())


def main(argv=None):
    """Write the joined corpus XML to stdout.

    Parameters:
        argv: optional list of command-line arguments (defaults to
            ``sys.argv[1:]``).  ``argv[0]`` overrides the source directory
            (``indir``) and ``argv[1]`` overrides the header file
            (``header``); both are optional.

    Raises:
        OSError (IOError on Python 2): if the header file or the source
            directory cannot be read.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    source_dir = args[0] if len(args) > 0 else indir
    header_path = args[1] if len(args) > 1 else header

    # Read the header before emitting anything, so a missing header file
    # does not leave a truncated XML document on stdout.
    with open(header_path, 'rb') as header_file:
        header_txt = header_file.read().decode('utf-8')

    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<korpus>')
    print(header_txt)
    print('<tekst>')
    for path in _iter_text_files(source_dir):
        _print_errors(path)
    print('</tekst>')
    print('</korpus>')


if __name__ == '__main__':
    main()
python joinXml.py > korpus_public/oppijakeel.xml
<header>
<parandajad></parandajad>
</header>
\ No newline at end of file
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment