Commit b6d93676 authored by rabauti's avatar rabauti
Browse files

korpus_xml failide elementidele id lisamine

parent 66d97a6c
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import sys
import pathlib
import re
from bs4 import BeautifulSoup
from pathlib import Path
# Directory that holds the source files.
# The output directory keeps the same folder structure.
indir = '/Users/rabauti/repos/tu/ut_veakorpus/korpus_xml'
outdir = '/Users/rabauti/repos/tu/ut_veakorpus/korpus_ids'

# Example of a path relative to indir: RP_algtekst_2004/A7.xml
# NOTE(review): the example above ends in .xml but only *.txt names are
# processed -- confirm which extension the input files really carry.
# Raw string with an escaped dot so that only a literal ".txt" suffix matches.
file_pattern_txt = re.compile(r'([^/]+)\.txt$')

# Child tags of <eksimus> that receive an id, with the letter used in that id.
CHILD_TAGS = (('algne', 'a'), ('parandus', 'p'), ('kommentaar', 'k'))


def clean_name(name):
    """Return *name* with the 'Veakorpus', ', uus' and ' uus' markers removed."""
    return name.replace('Veakorpus', '').replace(', uus', '').replace(' uus', '')


def make_doc_id(folder, name):
    """Build the document id ``<corrector><year>_<student>``.

    The corrector and the year are the first and third ``_``-separated
    components of *folder*; the student is the part of *name* before the
    first dot.

    Raises:
        IndexError: if *folder* has fewer than three ``_``-separated parts.
    """
    folder_parts = folder.split('_')
    corrector = folder_parts[0]
    year = folder_parts[2]
    student = name.split('.')[0]
    return corrector + year + '_' + student


def assign_ids(dokument, doc_id):
    """Set ``id`` attributes on *dokument* and the elements below it.

    The document gets ``doc_id``; the n-th <eksimus> gets ``<doc_id>_e<n>``;
    the m-th <algne>/<parandus>/<kommentaar> found under that <eksimus> gets
    ``<doc_id>_e<n>_a<m>`` / ``_p<m>`` / ``_k<m>``. Numbering starts at 1.
    """
    dokument['id'] = doc_id
    for i, eksimus in enumerate(dokument.find_all('eksimus'), start=1):
        eksimus_id = '%s_e%d' % (doc_id, i)
        eksimus['id'] = eksimus_id
        for tag, letter in CHILD_TAGS:
            for j, element in enumerate(eksimus.find_all(tag), start=1):
                element['id'] = '%s_%s%d' % (eksimus_id, letter, j)


def process_file(folder, name):
    """Read ``indir/folder/name``, add ids and write the result under outdir.

    The output file name is *name* passed through ``clean_name``; the
    extension is kept as it is.

    Raises:
        ValueError: if the parsed file contains no <dokument> element.
    """
    source_path = Path(indir) / folder / name
    out_name = clean_name(name)
    doc_id = make_doc_id(folder, out_name)

    # Open the file for reading. The encoding is given explicitly so the
    # result does not depend on the platform default; the context manager
    # closes the handle even if parsing fails.
    with open(source_path, 'r', encoding='utf-8') as file_in:
        soup = BeautifulSoup(file_in.read(), "xml")

    out_folder = Path(outdir) / folder
    out_folder.mkdir(parents=True, exist_ok=True)

    dokument = soup.find('dokument')
    if dokument is None:
        # Fail with the offending path instead of an opaque TypeError on None.
        raise ValueError('no <dokument> element in %s' % source_path)
    assign_ids(dokument, doc_id)

    # Start every <eksimus> element on a new line in the serialized output.
    out_xml = str(soup).replace('<eksimus', '\n<eksimus')
    with open(out_folder / out_name, 'w', encoding='utf-8') as file_out:
        file_out.write(out_xml)


def main():
    """Process every matching file in every sub-folder of indir."""
    for folder in os.listdir(indir):
        folder = folder.strip()
        folder_path = Path(indir) / folder
        if not folder_path.is_dir():
            # Stray plain files (e.g. .DS_Store) cannot be listed; skip them.
            continue
        for name in os.listdir(folder_path):
            if file_pattern_txt.search(name):
                process_file(folder, name)


if __name__ == '__main__':
    main()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment