Commit 2a4609bc authored by rabauti's avatar rabauti
Browse files

nimi muutunud

parent 63cee48b
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import sys
import re
import difflib
import copy
from difflib import Differ
from estnltk import Text
# Records whose uid suffix starts with 'a', keyed by the uid with its last
# '_'-separated component removed: {base_uid: {'id': uid, 'text': text}}.
originals = {}
# Records whose uid suffix starts with 'p', grouped under the same base uid:
# {base_uid: [{'id': uid, 'text': text}, ...]}.
corrections = {}

# All paths are resolved relative to the location of this script.
script_name = os.path.realpath(__file__)
script_dir = os.path.dirname(script_name)
filename = script_dir + '/../korpus_tsv/korpus.tsv'
outdir = script_dir + '/morf'

# Read the corpus: one record per line, exactly two tab-separated columns
# (uid, text).  The context manager guarantees the file is closed even when
# the script aborts on a malformed line.
# NOTE(review): the corpus is assumed to be UTF-8 encoded -- confirm.
with open(filename, 'r', encoding='utf-8') as file_input:
    for line in file_input:
        line = line.rstrip()
        line_arr = line.split('\t')
        if len(line_arr) != 2:
            # Malformed record: report the offending line and stop.
            print('ERROR :', line)
            sys.exit(1)
        uid, text = line_arr
        # Split off the last '_'-separated component.  A uid without any
        # underscore yields an empty base uid and the whole uid as ending.
        uid2, _, uid_ending = uid.rpartition('_')
        # Slice instead of indexing so an empty ending (uid is empty or ends
        # with '_') is simply ignored instead of raising IndexError.
        el_type = uid_ending[:1]
        if el_type == 'a':
            originals[uid2] = {'id': uid, 'text': text}
        elif el_type == 'p':
            corrections.setdefault(uid2, []).append({'id': uid, 'text': text})
def _diff_lines(items):
    """Return *items* as a list of lines suitable for ``difflib.HtmlDiff``.

    The items are joined with newlines and split again with the line endings
    kept, so every line except the last one ends with a newline.
    """
    return "\n".join(items).splitlines(True)


def _analyse(raw_text):
    """Analyse *raw_text* with estnltk and return ``(word_texts, tsv_rows)``.

    ``word_texts`` is the list of word strings reported by ``Text``.
    ``tsv_rows`` holds one tab-separated string per word with the columns
    word text, lemma, part-of-speech tag and form.
    """
    analysed = Text(raw_text)
    lemmas = analysed.lemmas
    word_texts = analysed.word_texts
    postags = analysed.postags
    forms = analysed.forms
    # Assumes the four lists are parallel (one entry per word) -- TODO confirm
    # against the estnltk version in use.
    tsv_rows = ["\t".join(columns)
                for columns in zip(word_texts, lemmas, postags, forms)]
    return word_texts, tsv_rows


d = difflib.HtmlDiff(tabsize=4, wrapcolumn=40)

# Make sure the report directory exists before the first file is written.
os.makedirs(outdir, exist_ok=True)

# Write one HTML report per correction, comparing it with the original text
# that shares the same base uid.
for uid, uid_corrections in corrections.items():
    original = originals.get(uid)
    if original is None:
        # Without an original there is nothing to diff against; report it
        # and carry on with the remaining texts instead of crashing.
        print('WARNING: no original text for', uid, file=sys.stderr)
        continue

    # The original is the same for every correction of this uid, so it is
    # analysed only once.
    original_words, original_rows = _analyse(original['text'])

    for correction in uid_corrections:
        filename2 = os.path.join(outdir, correction['id'] + '.html')
        corrected_words, corrected_rows = _analyse(correction['text'])

        # NOTE: the report is a bare diff table (word level) followed by a
        # complete HTML document (word/lemma/postag/form level).  This layout
        # is kept exactly as the script has always produced it.
        html_diff_result = d.make_table(_diff_lines(original_words),
                                        _diff_lines(corrected_words))
        html_diff_result += d.make_file(_diff_lines(original_rows),
                                        _diff_lines(corrected_rows))

        # make_file() declares charset utf-8 by default, so write the file
        # with that encoding regardless of the platform default.
        with open(filename2, 'w', encoding='utf-8') as file_out:
            file_out.write(html_diff_result)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment