Commit b5ea55d3 authored by rabauti's avatar rabauti
Browse files

korpuse laused TSV failis: ELEMENDI_ID ELEMENDI_TEXT

parent e80584d9
This diff is collapsed.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#python makeTSV.py | sort -d > korpus_tsv/korpus.tsv
import os
import sys
import pathlib
import re
from bs4 import BeautifulSoup
from pathlib import Path
# Directory with the source files: each non-hidden sub-directory is scanned
# for *.txt files, which are parsed as XML.
indir = '/Users/rabauti/repos/tu/ut_veakorpus/korpus_ids'

# Matches file names ending in ".txt". Raw string with an escaped dot, so a
# name such as "footxt" is no longer accepted by accident.
file_pattern_txt = re.compile(r'([^/]+)\.txt$')

# One output row: ELEMENT_ID <TAB> ELEMENT_TEXT.
row_template = '%s\t%s'

# Child elements of every <eksimus> that are exported, in output order.
item_names = ('algne', 'parandus', 'kommentaar')


def format_row(item_id, text) -> str:
    """Return one TSV row for an element id and its text.

    Both values are stripped of surrounding whitespace and newlines inside
    the text are replaced by spaces so that every row stays on one line.
    """
    return row_template % (item_id.strip(), text.strip().replace('\n', ' '))


def element_text(item):
    """Return the text content of a parsed element, never None.

    ``item.string`` is None for an empty element and for an element with
    more than one child; fall back to ``get_text()`` in those cases instead
    of failing on ``None.strip()``.
    """
    text = item.string
    if text is None:
        text = item.get_text()
    return text


def iter_rows(dokument):
    """Yield the TSV rows for every exported element below *dokument*.

    For each <eksimus>, the elements named in ``item_names`` are visited in
    that order. A missing ``id`` attribute still raises KeyError.
    """
    for eksimus in dokument.find_all('eksimus'):
        for item_name in item_names:
            for item in eksimus.find_all(item_name):
                text = element_text(item)
                # Empty comments are left out of the output.
                if item_name == 'kommentaar' and not text.strip():
                    continue
                yield format_row(item['id'], text)


def main():
    """Print one TSV row per corpus element to stdout.

    Usage: python makeTSV.py | sort -d > korpus_tsv/korpus.tsv
    """
    for folder in os.listdir(indir):
        folder_path = os.path.join(indir, folder)
        # Skip hidden entries and plain files lying next to the folders.
        if folder.startswith('.') or not os.path.isdir(folder_path):
            continue
        for name in os.listdir(folder_path):
            if not file_pattern_txt.search(name):
                continue
            filename = os.path.join(folder_path, name)
            # Close the file deterministically and do not depend on the
            # platform default encoding.
            # NOTE(review): assumes the corpus files are UTF-8 - confirm.
            with open(filename, 'r', encoding='utf-8') as handle:
                soup = BeautifulSoup(handle.read(), "xml")
            dokument = soup.find('dokument')
            if dokument is None:
                # Report on stderr so the piped TSV output stays clean.
                sys.stderr.write(
                    'skipping %s: no <dokument> element\n' % filename)
                continue
            for row in iter_rows(dokument):
                print(row)


if __name__ == '__main__':
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment