Commit f9a85e87 authored by rabauti's avatar rabauti
Browse files

skript algteksti ja paranduse võrdlemiseks ning vigade liigitamiseks (loogika veel veidi pooleli)

parent df966b65
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import sys
import re
import difflib
import copy
from estnltk import Text
from difflib import Differ
# Try to detect some of the simpler error types.
# Source file: ../korpus_tsv
# Read in all sentences and build dicts:
#   originals[sentenceID]        = {'id': ..., 'text': sentence}
#   corrections[sentenceID][0..n] = {'id': ..., 'text': sentence}
originals = {}
corrections = {}
script_name = os.path.realpath(__file__)
script_dir = os.path.dirname(script_name)
def intersection(lst1, lst2):
    """Return the unique elements common to *lst1* and *lst2* as a list.

    Either argument may be None or empty; in that case [] is returned.
    The result order is unspecified (set-based).
    """
    # A single truthiness test covers both None and empty sequences;
    # the original's extra len() checks were redundant.
    if not lst1 or not lst2:
        return []
    return list(set(lst1) & set(lst2))
def get_data_from_ndiff(ndiff_list):
    """Summarize a difflib.ndiff result list.

    Returns a dict with:
      added / deleted         -- tokens present only in the new / old sequence
      added_pos / deleted_pos -- their 0-based positions in the new / old sequence
      changed                 -- True when anything was added or deleted
      pos_intersection        -- sorted positions occurring in both pos lists
    """
    added, added_pos = [], []
    deleted, deleted_pos = [], []
    old_idx = -1
    new_idx = -1
    for entry in ndiff_list:
        # '? ' hint lines belong to neither sequence; skip them entirely.
        if entry.startswith('?'):
            continue
        token = entry.rstrip()[2:]
        if entry.startswith('-'):
            old_idx += 1
            deleted.append(token)
            deleted_pos.append(old_idx)
        elif entry.startswith('+'):
            new_idx += 1
            added.append(token)
            added_pos.append(new_idx)
        else:
            # unchanged line: advances the position in both sequences
            old_idx += 1
            new_idx += 1
    return {
        'added': added,
        'added_pos': added_pos,
        'deleted': deleted,
        'deleted_pos': deleted_pos,
        'changed': bool(added_pos or deleted_pos),
        'pos_intersection': sorted(intersection(added_pos, deleted_pos)),
    }
# Read the corpus TSV: each line is "<uid>\t<sentence>".
# The uid's last underscore-separated part starts with 'a' for the original
# sentence and 'p' for a correction of it.
filename = script_dir +'/../korpus_tsv/korpus.tsv'
#outdir = script_dir +'/korpus_xml'
# 'with' guarantees the input file is closed (the original leaked the handle)
with open(filename, 'r') as file_input:
    for line in file_input:
        line = line.rstrip()
        line_arr = line.split('\t')
        if len(line_arr) != 2:
            print ('ERROR :' , line)
            # sys.exit is the canonical way out of a script; bare exit() comes
            # from the site module and is not guaranteed to exist
            sys.exit(1)
        uid = line_arr[0]
        text = line_arr[1]
        uid_arr = uid.split('_')
        uid_ending = uid_arr.pop()
        uid2 = '_'.join(uid_arr)
        el_type = uid_ending[0]  # 'a' = original, 'p' = correction
        if el_type == 'a':
            originals[uid2] = {'id': uid, 'text': text.strip()}
        elif el_type == 'p':
            # a sentence may have several corrections; collect them in a list
            corrections.setdefault(uid2, []).append({'id': uid, 'text': text.strip()})
collected_flags = {}

# Classification of corrections: flag id -> display name and sort order.
# NOTE(review): the misspelled name 'errorDecriptions' is kept as-is because
# code further down the file may reference it.
errorDecriptions = {
    'tundmatu': {'name': 'Tundmatu', 'order': '1'},
    'puudub': {'name': 'Parandus puudub', 'order': '1'},
    'tühik': {'name': 'Tühik kirjavahemärgi ees v taga', 'order': '1'},
    'punktuatsioon': {'name': 'Kirjavahemärk lisatud/eemaldatud', 'order': '1'},
    'sõnadejärg': {'name': 'Sõnadejärjekord parandatud', 'order': '1'},
    'sõnalisatud': {'name': 'Lisati puuduv sõna', 'order': '1'},
    'sõnaeemaldatud': {'name': 'Kustutati sõna', 'order': '1'},
}
# Start processing the corrections.
d = Differ()
d2 = difflib.HtmlDiff( tabsize=4, wrapcolumn=40 )

# Running counters over all (original, correction) pairs.
stats = {}
stats['total'] = 0
stats['lahendamata1'] = 0
stats['lahendamata2'] = 0

filename2 = 'tulemus/tundmatu.html'
# make sure the output directory exists; the original crashed on open()
# when 'tulemus/' was missing
os.makedirs(os.path.dirname(filename2), exist_ok=True)
# NOTE: file_out stays open on purpose — the main loop below keeps writing to it
file_out = open(filename2, 'w')
# <meta charset> moved inside <head> so the declared encoding is honored
file_out.write( """<html><head><meta charset="UTF-8"></head>
<style>
table {
margin-top: 50px;
width: 1000px;
border-collapse: collapse;
}
table, th, td
{
border: 1px solid black;
vertical-align:top;
padding: 5px;
}
</style>
<body>
""")
file_out.write( '<h1>%s</h1>' % ('Tuvastamata veaga'))
file_out.write( '<table style="padding:5px">')
linenr = 0
for uid in sorted(corrections.keys()):
linenr +=1
if linenr > 100: continue
to_print = 0
correction_sets = []
for (i, correction) in enumerate(corrections[uid]):
stats['total'] += 1
flags = []
text1 = Text(originals[uid]['text'])
originals[uid]['tokenized'] = ' '.join(text1.word_texts)
text2 = Text(correction['text'])
#text2_words = text2.word_texts
corrections[uid][i]['tokenized'] = ' '.join(text2.word_texts)
ndiff_result = difflib.ndiff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1))
ndiff_result_list = list(ndiff_result)
unified_diff_result = difflib.unified_diff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1))
html_diff_result = []
for line in unified_diff_result:
if line.startswith('-'):
html_diff_result.append('<span style="color:red">%s</span>' % line)
elif line.startswith('+'):
html_diff_result.append('<span style="color:green">%s</span>' % line)
elif line.startswith('?'):
html_diff_result.append('<span style="color:gray">%s</span>' % line)
else:
html_diff_result.append('%s' % line)
corrections[uid][i]['unified_diff_result'] = list(unified_diff_result)
corrections[uid][i]['html_diff_result'] = html_diff_result
#algsed tööks vajalikud massiivid
#text1
#text2
#ndiff_result
#unified_diff_result
#et originaal säiliks töötame edasi koopiatega
#finished
finished = 0
text1_copy = Text(originals[uid]['text'])
text2_copy= Text(correction['text'])
text1_lemmas = text1_copy.lemmas
text2_lemmas = text2_copy.lemmas
text1_word_texts = text1_copy.word_texts
text2_word_texts = text2_copy.word_texts
text1_postags = text1_copy.postags
text2_postags = text2_copy.postags
text1_forms = text1_copy.forms
text2_forms = text2_copy.forms
ndiff_result_list_copy = copy.copy(ndiff_result_list)
added = []
deleted = []
added_pos = []
deleted_pos = []
# esiteks lihtne kontroll, et tokeniseerimata originaal ja tokeniseerimata parandus ei erine omavahel
if originals[uid]['text'] == correction['text']:
flags.append('puudub')
finished = 1
if not finished and originals[uid]['tokenized'] == correction['tokenized']:
flags.append('tühik')
finished = 1
#vaatame, kas kirjavahemärkidega tehti midagi
if not finished:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
added_pos = [text1_postags[index] for index in ndiff_data['deleted_pos']]
deleted_pos = [text2_postags[index] for index in ndiff_data['added_pos']]
added_pos_uniq = list(set([text1_postags[index] for index in ndiff_data['deleted_pos']]))
deleted_pos_uniq = list(set([text2_postags[index] for index in ndiff_data['added_pos']]))
#print ('added_pos_uniq', added_pos_uniq)
#print ('deleted_pos_uniq', deleted_pos_uniq)
rowsets = {}
#muudatused on seatud ainult kirjavahemärkidega
if ''.join(added_pos_uniq) in ['Z', ''] and ''.join(deleted_pos_uniq) in ['Z', '']:
flags.append('punktuatsioon')
#correction_sets.append( {'type':'punktuatsioon', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished = 1
#muudatused on ka kirjavahemärkidega
elif 'Z' in added_pos + deleted_pos:
flags.append('punktuatsioon')
#teeme siin sellise sammu, kus eemaldame kõik kirjavahemärgid ja saame uuesti võrrelda
for (ind, pos) in reversed(list( enumerate(text1_postags))):
if pos == 'Z':
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
text1_postags.pop(ind)
text1_forms.pop(ind)
removed = []
for (ind, pos) in reversed(list( enumerate(text2_postags))):
if pos == 'Z':
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
text2_postags.pop(ind)
text2_forms.pop(ind)
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
#print ("+++++++")
#print ("".join(ndiff_result_list_copy))
ndiff_result_list_copy = list(new_ndiff_result)
if not len(ndiff_result_list_copy):
finished = 1
#print ("".join(ndiff_result_list_copy))
if not finished:
#sõnade järjekorra kontrollimine
# kui sõna on lause alguses, siis võrdleme seda lowercase
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
#keerame esimesed sõnad lowercase
if 0 in ndiff_data['added_pos']:
#print (ndiff_data['added'][0])
ndiff_data['added'][0] = ndiff_data['added'][0].lower()
#print (ndiff_data['added'][0])
if 0 in ndiff_data['deleted_pos']:
#print (ndiff_data['deleted'][0])
ndiff_data['deleted'][0] = ndiff_data['deleted'][0].lower()
#print (ndiff_data['deleted'][0])
if (sorted(ndiff_data['added']) == sorted(ndiff_data['deleted'])):
flags.append('sõnajärg')
correction_sets.append( {'type':'sõnajärg', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished = 1
elif intersection(ndiff_data['added'], ndiff_data['deleted']):
flags.append('sõnajärg')
deleted_elements_ind = []
added_elements_ind = []
#kustutame ära need sõnad, mis vahetasid asukohta
for token in intersection(ndiff_data['added'], ndiff_data['deleted']):
#kustutame ainult esimese esinemise
deleted_elements_ind.append(ndiff_data['deleted_pos'][ndiff_data['deleted'].index(token)])
added_elements_ind.append(ndiff_data['added_pos'][ndiff_data['added'].index(token)])
correction_sets.append( {'type':'sõnajärg', 'added': intersection(ndiff_data['added'], ndiff_data['deleted']), 'deleted': intersection(ndiff_data['added'], ndiff_data['deleted']) })
for ind in reversed(sorted(deleted_elements_ind)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
text1_postags.pop(ind)
text1_forms.pop(ind)
for ind in reversed(sorted(added_elements_ind)):
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
text2_postags.pop(ind)
text2_forms.pop(ind)
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
#print ("+++++++")
#print ("".join(ndiff_result_list_copy))
ndiff_result_list_copy = list(new_ndiff_result)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not ndiff_data['changed']:
finished = 1
#print ("".join(ndiff_result_list_copy))
if not finished:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not len(ndiff_data['added']):
flags.append('sõnaüle')
finished =1
elif not len(ndiff_data['deleted']):
flags.append('sõnapuudu')
finished =1
if not finished:
if "".join(text1_word_texts) == "".join(text2_word_texts):
flags.append('kokku-lahku')
finished =1
else:
#otsin lisatud sõnade seas sõnu, mis on järjest positsioonidel ja mille kokkuliitmisel saab mõne kustutatud sõna
#otsin kustutatud sõnade seas sõnu, mis on järjest positsioonidel ja mille kokkuliitmisel saab mõne lisatud sõna
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
#added_pos = [text1_postags[index] for index in ndiff_data['deleted_pos']]
#deleted_pos = [text2_postags[index] for index in ndiff_data['added_pos']]
joined = {}
text1_word_texts_old = []
#kokku liidetud sõnad
while not text1_word_texts == text1_word_texts_old:
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result)
text1_word_texts_old = copy.copy(text1_word_texts)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
remove_added = []
remove_deleted = []
for pos in reversed(ndiff_data['deleted_pos']):
ind = ndiff_data['deleted_pos'].index(pos)
if pos+1 in ndiff_data['deleted_pos']:
joinedword = ndiff_data['deleted'][ind] + ndiff_data['deleted'][ind+1]
if joinedword in ndiff_data['added']:
#print (ndiff_data['deleted'], ndiff_data['added'])
remove_added.append(ndiff_data['added_pos'][ndiff_data['added'].index(joinedword)])
remove_deleted.append(pos)
remove_deleted.append(pos+1)
#print (remove_deleted, remove_added)
break
if len(remove_deleted):
flags.append('kokku-lahku')
for ind in reversed(sorted(remove_deleted)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
text1_postags.pop(ind)
text1_forms.pop(ind)
for ind in reversed(sorted(remove_added)):
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
text2_postags.pop(ind)
text2_forms.pop(ind)
text1_word_texts_old = copy.copy(text1_word_texts)
text1_word_texts_old = []
#lahku tõstetud sõnad
while not text1_word_texts == text1_word_texts_old:
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result)
text1_word_texts_old = copy.copy(text1_word_texts)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
remove_added = []
remove_deleted = []
for pos in reversed(ndiff_data['added_pos']):
ind = ndiff_data['added_pos'].index(pos)
if pos+1 in ndiff_data['added_pos']:
joinedword = ndiff_data['added'][ind] + ndiff_data['added'][ind+1]
if joinedword in ndiff_data['deleted']:
#print (ndiff_data['added'], ndiff_data['deleted'])
remove_deleted.append(ndiff_data['deleted_pos'][ndiff_data['deleted'].index(joinedword)])
remove_added.append(pos)
remove_added.append(pos+1)
#print (remove_added, remove_deleted)
break
if len(remove_deleted):
flags.append('kokku-lahku')
for ind in reversed(sorted(remove_deleted)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
text1_postags.pop(ind)
text1_forms.pop(ind)
for ind in reversed(sorted(remove_added)):
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
text2_postags.pop(ind)
text2_forms.pop(ind)
text1_word_texts_old = copy.copy(text1_word_texts)
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not ndiff_data['changed']:
finished = 1
if not finished:
#sama sõna muu vorm
if (text1_lemmas == text2_lemmas and text1_forms == text2_forms and ' '.join(text1_word_texts).lower()==' '.join(text2_word_texts).lower() ):
flags.append('suurväike')
finished =1
elif (text1_lemmas == text2_lemmas and text1_forms == text2_forms and not text1_word_texts==text2_word_texts ):
flags.append('paralleelvorm')
finished =1
else:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
resolved_pos = []
casediff = 0
worddiff = 0
for token_pos in intersection(ndiff_data['added_pos'], ndiff_data['deleted_pos']):
if text1_lemmas[token_pos] == text2_lemmas[token_pos] and text1_forms[token_pos] == text2_forms[token_pos]:
if text1_word_texts[token_pos].lower() == text2_word_texts[token_pos].lower():
casediff +=1
else:
worddiff +=1
resolved_pos.append(token_pos)
if len (resolved_pos):
if casediff:
flags.append('suurväike')
if worddiff:
flags.append('paralleelvorm')
for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
text1_postags.pop(ind)
text1_forms.pop(ind)
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
text2_postags.pop(ind)
text2_forms.pop(ind)
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not ndiff_data['changed']:
finished = 1
if not finished:
#sama sõna muu vorm
#vaatas kogu teksti
if (text1_lemmas == text2_lemmas and not text1_word_texts==text2_word_texts ):
flags.append('valevorm')
finished =1
#vaatame samal positsioonil asuvaid asendusi
else:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
resolved_pos = []
for token_pos in intersection(ndiff_data['added_pos'], ndiff_data['deleted_pos']):
if len(intersection(text1_lemmas[token_pos].split('|'), text2_lemmas[token_pos].split('|'))):
# if text1_lemmas[token_pos] == text2_lemmas[token_pos]:
resolved_pos.append(token_pos)
if len (resolved_pos):
flags.append('valevorm')
for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
text1_postags.pop(ind)
text1_forms.pop(ind)
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
text2_postags.pop(ind)
text2_forms.pop(ind)
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not ndiff_data['changed']:
finished = 1
if not finished:
#sama vorm muu sõna
if (text1_forms == text2_forms and not text1_lemmas==text2_lemmas ):
flags.append('valelemma')
finished =1
#vaatame samal positsioonil asuvaid asendusi
else:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
resolved_pos = []
for token_pos in intersection(ndiff_data['added_pos'], ndiff_data['deleted_pos']):
sub_type = ''
if text1_forms[token_pos] == text2_forms[token_pos] and not text1_lemmas[token_pos]==text2_lemmas[token_pos]:
resolved_pos.append(token_pos)
elif intersection(text1_forms[token_pos].split('|'), text2_forms[token_pos].split('|')) and not text1_lemmas[token_pos]==text2_lemmas[token_pos]:
sub_type = '2'
resolved_pos.append(token_pos)
elif len(intersection(text1_forms[token_pos].split('|'), text2_forms[token_pos].split('|'))) and not len(intersection( text1_lemmas[token_pos].split('|'), text2_lemmas[token_pos].split('|'))):
sub_type = '3'
resolved_pos.append(token_pos)
if len (resolved_pos):
flags.append('valelemma'+sub_type)
for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
text1_postags.pop(ind)
text1_forms.pop(ind)
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
text2_postags.pop(ind)
text2_forms.pop(ind)
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not ndiff_data['changed']:
finished = 1
if not finished:
#nud<->nu tud<->tu nudtud
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
resolved_pos = []
for token_pos in intersection(ndiff_data['added_pos'], ndiff_data['deleted_pos']):
if ( 'nud' in text1_forms[token_pos].split('|') or 'tud' in text1_forms[token_pos].split('|') ) and len(intersection(text1_lemmas[token_pos].split('|'), text2_lemmas[token_pos].split('|')) ):
resolved_pos.append(token_pos)
if len (resolved_pos):
#print('here')
flags.append('nudtud')
for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
text1_postags.pop(ind)
text1_forms.pop(ind)
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
text2_postags.pop(ind)
text2_forms.pop(ind)
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))