#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Classify the kinds of corrections made in a learner corpus.

Reads ``../korpus_tsv/korpus.tsv`` (one ``uid<TAB>sentence`` pair per line,
relative to this script).  A uid whose last ``_``-component starts with ``a``
is an original sentence; ``p`` marks a correction of it.  Each correction is
diffed (token-wise, via :mod:`difflib`) against its original and classified
into flags such as 'puudub' (no change), 'tühik' (whitespace only),
'punktuatsioon', 'sõnajärg' (word order), 'kokku-lahku' (compound split/join),
'suurväike' (capitalisation), 'paralleelvorm', 'valevorm', 'valelemma', with
'0'/'00' marking unresolved cases.  HTML reports are written under
``tulemus/``.

Requires the ``estnltk`` package (imported lazily inside :func:`main` so the
pure helpers in this module can be imported and tested without it).

NOTE(review): this file was recovered from a mangled extraction in which the
HTML template string literals were stripped.  All HTML emitted below is a
minimal reconstruction of the original report markup — verify the layout
against an original report before relying on it.
"""
import copy
import difflib
import os
import re
import sys
from difflib import Differ  # kept: present in the original import block

script_name = os.path.realpath(__file__)
script_dir = os.path.dirname(script_name)

# Human-readable labels for the correction classes.
# NOTE(review): several keys ('sõnadejärg', 'sõnalisatud', 'sõnaeemaldatud')
# do not match the flag names actually emitted below ('sõnajärg', 'sõnapuudu',
# 'sõnaüle'); the table is not read anywhere in this file — confirm whether an
# external consumer uses it before renaming the keys.
errorDecriptions = {
    'tundmatu': {'name': 'Tundmatu', 'order': '1'},
    'puudub': {'name': 'Parandus puudub', 'order': '1'},
    'tühik': {'name': 'Tühik kirjavahemärgi ees v taga', 'order': '1'},
    'punktuatsioon': {'name': 'Kirjavahemärk lisatud/eemaldatud', 'order': '1'},
    'sõnadejärg': {'name': 'Sõnadejärjekord parandatud', 'order': '1'},
    'sõnalisatud': {'name': 'Lisati puuduv sõna', 'order': '1'},
    'sõnaeemaldatud': {'name': 'Kustutati sõna', 'order': '1'},
}


def intersection(lst1, lst2):
    """Return the (unordered) list of elements common to both sequences.

    Either argument may be ``None`` or empty, in which case ``[]`` is
    returned.  Order of the result is unspecified (set semantics).
    """
    # ``not lst`` already covers both None and empty — the original's extra
    # ``len()`` checks were redundant.
    if not lst1 or not lst2:
        return []
    return list(set(lst1) & set(lst2))


def get_data_from_ndiff(ndiff_list):
    """Summarise a :func:`difflib.ndiff` result over token-per-line input.

    Parameters:
        ndiff_list: list of ndiff output lines ('- x', '+ x', '? ...', '  x').

    Returns a dict with:
        'deleted'/'deleted_pos': tokens removed, with their positions in the
            OLD (original) token sequence;
        'added'/'added_pos': tokens inserted, with their positions in the NEW
            (corrected) token sequence;
        'changed': False iff the diff contains no additions or deletions;
        'pos_intersection': sorted positions present on both sides, i.e.
            likely in-place substitutions.
    """
    added = []
    deleted = []
    added_pos = []
    deleted_pos = []
    line_nr_new = -1
    line_nr_old = -1
    for line in ndiff_list:
        token = line.rstrip()[2:]  # strip the two-character ndiff prefix
        if line.startswith('-'):
            line_nr_old += 1
            deleted.append(token)
            deleted_pos.append(line_nr_old)
            continue
        if line.startswith('+'):
            line_nr_new += 1
            added.append(token)
            added_pos.append(line_nr_new)
            continue
        if line.startswith('?'):
            # hint lines carry no token and advance neither position
            continue
        # unchanged line: advances both positions
        line_nr_old += 1
        line_nr_new += 1
    changed = bool(added_pos or deleted_pos)
    return {'added': added,
            'added_pos': added_pos,
            'deleted': deleted,
            'deleted_pos': deleted_pos,
            'changed': changed,
            'pos_intersection': sorted(intersection(added_pos, deleted_pos))}


def _words_ndiff(words1, words2):
    """Token-wise ndiff of two word lists (one token per diff line)."""
    return list(difflib.ndiff("\n".join(words1).splitlines(True),
                              "\n".join(words2).splitlines(True)))


def _pop_word(ind, lemmas, word_texts, postags, forms):
    """Drop token *ind* from the four parallel annotation lists.

    Returns the removed surface form (used for logging which words were
    consumed by a classification step).
    """
    lemmas.pop(ind)
    word = word_texts.pop(ind)
    postags.pop(ind)
    forms.pop(ind)
    return word


def _diff_to_html(diff_lines):
    """Wrap unified-diff lines in spans for the HTML report.

    NOTE(review): the original markup was lost in extraction; the class names
    below are a reconstruction — align them with the report stylesheet.
    """
    html = []
    for line in diff_lines:
        if line.startswith('-'):
            html.append('<span class="deleted">%s</span>' % line)
        elif line.startswith('+'):
            html.append('<span class="added">%s</span>' % line)
        elif line.startswith('?'):
            html.append('<span class="hint">%s</span>' % line)
        else:
            html.append('%s' % line)
    return html


def _read_corpus(filename):
    """Read the tab-separated corpus file into (originals, corrections).

    originals[uid2]      = {'id': full_uid, 'text': sentence}   (type 'a')
    corrections[uid2]    = [{'id': full_uid, 'text': sentence}, ...] ('p')

    Exits the process with status 1 on a malformed line (original behaviour).
    The file is now closed deterministically via ``with`` (original leaked it).
    """
    originals = {}
    corrections = {}
    with open(filename, 'r') as file_input:
        for line in file_input:
            line = line.rstrip()
            line_arr = line.split('\t')
            if not len(line_arr) == 2:
                print('ERROR :', line)
                sys.exit(1)
            uid, text = line_arr
            uid_arr = uid.split('_')
            uid_ending = uid_arr.pop()
            uid2 = '_'.join(uid_arr)
            el_type = uid_ending[0]
            if el_type == 'a':
                originals[uid2] = {'id': uid, 'text': text.strip()}
            if el_type == 'p':
                corrections.setdefault(uid2, []).append(
                    {'id': uid, 'text': text.strip()})
    return originals, corrections


# Minimal reconstructed page skeleton (original template was lost).
_HTML_HEAD = '<!DOCTYPE html>\n<html><head><meta charset="utf-8"></head><body>\n'
_HTML_FOOT = '</table>\n</body></html>\n'


def main():
    # estnltk is a heavy project dependency; import it only when actually
    # running the analysis so the module stays importable without it.
    from estnltk import Text

    originals, corrections = _read_corpus(
        script_dir + '/../korpus_tsv/korpus.tsv')

    collected_flags = {}  # flags_label -> number of corrections with it
    stats = {'total': 0, 'lahendamata1': 0, 'lahendamata2': 0}

    # Robustness: the original assumed tulemus/ already existed.
    os.makedirs('tulemus', exist_ok=True)

    file_out = open('tulemus/tundmatu.html', 'w')
    file_out.write(_HTML_HEAD)
    file_out.write('<h2>%s</h2>\n' % ('Tuvastamata veaga',))
    file_out.write('<table border="1">\n')

    linenr = 0
    for uid in sorted(corrections.keys()):
        linenr += 1
        # if linenr > 1000: continue   # debugging limiter from the original
        for (i, correction) in enumerate(corrections[uid]):
            correction_sets = []
            stats['total'] += 1
            flags = []

            text1 = Text(originals[uid]['text'])
            originals[uid]['tokenized'] = ' '.join(text1.word_texts)
            text2 = Text(correction['text'])
            corrections[uid][i]['tokenized'] = ' '.join(text2.word_texts)

            ndiff_result_list = _words_ndiff(text1.word_texts, text2.word_texts)

            # BUGFIX: the original iterated the unified_diff generator to
            # build the HTML and then stored list(generator) — always empty
            # because the generator was already exhausted.  Materialise once.
            unified_diff_result = list(difflib.unified_diff(
                "\n".join(text1.word_texts).splitlines(True),
                "\n".join(text2.word_texts).splitlines(True)))
            corrections[uid][i]['unified_diff_result'] = unified_diff_result
            corrections[uid][i]['html_diff_result'] = _diff_to_html(
                unified_diff_result)

            # Work on copies so the stored originals stay intact.
            finished = 0
            text1_copy = Text(originals[uid]['text'])
            text2_copy = Text(correction['text'])
            text1_lemmas = text1_copy.lemmas
            text2_lemmas = text2_copy.lemmas
            text1_word_texts = text1_copy.word_texts
            text2_word_texts = text2_copy.word_texts
            text1_postags = text1_copy.postags
            text2_postags = text2_copy.postags
            text1_forms = text1_copy.forms
            text2_forms = text2_copy.forms
            ndiff_result_list_copy = copy.copy(ndiff_result_list)

            # 1. Original and correction are byte-identical: no correction.
            if originals[uid]['text'] == correction['text']:
                flags.append('puudub')
                finished = 1

            # 2. Only whitespace (tokenisation) differs.
            if not finished and originals[uid]['tokenized'] == correction['tokenized']:
                flags.append('tühik')
                finished = 1

            # 3. Punctuation added/removed.
            if not finished:
                ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                # POS tags of the tokens each diff side touched.  (The
                # original bound these to confusingly swapped names —
                # deleted tokens come from text1, added ones from text2.)
                deleted_postags = [text1_postags[index]
                                   for index in ndiff_data['deleted_pos']]
                added_postags = [text2_postags[index]
                                 for index in ndiff_data['added_pos']]
                deleted_postags_uniq = list(set(deleted_postags))
                added_postags_uniq = list(set(added_postags))
                if (''.join(deleted_postags_uniq) in ['Z', '']
                        and ''.join(added_postags_uniq) in ['Z', '']):
                    # Every changed token is punctuation ('Z').
                    flags.append('punktuatsioon')
                    finished = 1
                elif 'Z' in deleted_postags + added_postags:
                    # Punctuation is involved: strip all punctuation from both
                    # sides and re-diff what remains.
                    flags.append('punktuatsioon')
                    for (ind, pos) in reversed(list(enumerate(text1_postags))):
                        if pos == 'Z':
                            _pop_word(ind, text1_lemmas, text1_word_texts,
                                      text1_postags, text1_forms)
                    for (ind, pos) in reversed(list(enumerate(text2_postags))):
                        if pos == 'Z':
                            _pop_word(ind, text2_lemmas, text2_word_texts,
                                      text2_postags, text2_forms)
                    ndiff_result_list_copy = _words_ndiff(text1_word_texts,
                                                          text2_word_texts)
                    if not len(ndiff_result_list_copy):
                        finished = 1

            # 4. Word-order change (sentence-initial words compared lowercase).
            if not finished:
                ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                if 0 in ndiff_data['added_pos']:
                    ndiff_data['added'][0] = ndiff_data['added'][0].lower()
                if 0 in ndiff_data['deleted_pos']:
                    ndiff_data['deleted'][0] = ndiff_data['deleted'][0].lower()
                if sorted(ndiff_data['added']) == sorted(ndiff_data['deleted']):
                    # Exactly the same words on both sides, different order.
                    flags.append('sõnajärg')
                    correction_sets.append({'type': 'sõnajärg',
                                            'added': ndiff_data['added'],
                                            'deleted': ndiff_data['deleted']})
                    finished = 1
                elif intersection(ndiff_data['added'], ndiff_data['deleted']):
                    flags.append('sõnajärg')
                    deleted_elements_ind = []
                    added_elements_ind = []
                    # Remove the words that merely moved (first occurrence only).
                    for token in intersection(ndiff_data['added'],
                                              ndiff_data['deleted']):
                        deleted_elements_ind.append(
                            ndiff_data['deleted_pos'][ndiff_data['deleted'].index(token)])
                        added_elements_ind.append(
                            ndiff_data['added_pos'][ndiff_data['added'].index(token)])
                    deleted_log = []
                    for ind in reversed(sorted(deleted_elements_ind)):
                        deleted_log.insert(0, _pop_word(
                            ind, text1_lemmas, text1_word_texts,
                            text1_postags, text1_forms))
                    added_log = []
                    for ind in reversed(sorted(added_elements_ind)):
                        added_log.insert(0, _pop_word(
                            ind, text2_lemmas, text2_word_texts,
                            text2_postags, text2_forms))
                    correction_sets.append({'type': 'sõnajärg',
                                            'added': added_log,
                                            'deleted': deleted_log})
                    ndiff_result_list_copy = _words_ndiff(text1_word_texts,
                                                          text2_word_texts)
                    ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                    if not ndiff_data['changed']:
                        finished = 1

            # 5. Pure deletion / pure insertion of words.
            if not finished:
                ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                if not len(ndiff_data['added']):
                    flags.append('sõnaüle')
                    correction_sets.append({'type': 'sõnaüle', 'added': [],
                                            'deleted': ndiff_data['deleted']})
                    finished = 1
                elif not len(ndiff_data['deleted']):
                    flags.append('sõnapuudu')
                    correction_sets.append({'type': 'sõnapuudu',
                                            'added': ndiff_data['added'],
                                            'deleted': []})
                    finished = 1

            ##################
            # kokku-lahku (compound written together vs apart)
            ##################
            if not finished:
                ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                if "".join(text1_word_texts) == "".join(text2_word_texts):
                    # Same characters, different word boundaries only.
                    flags.append('kokku-lahku')
                    correction_sets.append({'type': 'kokku-lahku',
                                            'added': ndiff_data['added'],
                                            'deleted': ndiff_data['deleted']})
                    finished = 1
                else:
                    # Adjacent deleted words whose concatenation equals an
                    # added word (two words joined into one by the correction).
                    text1_word_texts_old = []
                    while not text1_word_texts == text1_word_texts_old:
                        ndiff_result_list_copy = _words_ndiff(text1_word_texts,
                                                              text2_word_texts)
                        text1_word_texts_old = copy.copy(text1_word_texts)
                        ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                        remove_added = []
                        remove_deleted = []
                        for pos in reversed(ndiff_data['deleted_pos']):
                            ind = ndiff_data['deleted_pos'].index(pos)
                            if pos + 1 in ndiff_data['deleted_pos']:
                                joinedword = (ndiff_data['deleted'][ind]
                                              + ndiff_data['deleted'][ind + 1])
                                if joinedword in ndiff_data['added']:
                                    remove_added.append(
                                        ndiff_data['added_pos'][ndiff_data['added'].index(joinedword)])
                                    remove_deleted.append(pos)
                                    remove_deleted.append(pos + 1)
                                    break
                        if len(remove_deleted):
                            flags.append('kokku-lahku')
                            deleted_log = []
                            for ind in reversed(sorted(remove_deleted)):
                                deleted_log.insert(0, _pop_word(
                                    ind, text1_lemmas, text1_word_texts,
                                    text1_postags, text1_forms))
                            added_log = []
                            for ind in reversed(sorted(remove_added)):
                                added_log.insert(0, _pop_word(
                                    ind, text2_lemmas, text2_word_texts,
                                    text2_postags, text2_forms))
                            correction_sets.append({'type': 'kokku-lahku',
                                                    'added': added_log,
                                                    'deleted': deleted_log})
                            # NOTE(review): refreshing the sentinel here makes
                            # the while loop stop after one merge; kept as in
                            # the original — confirm whether repeated merging
                            # was intended.
                            text1_word_texts_old = copy.copy(text1_word_texts)

                    # Adjacent added words whose concatenation equals a
                    # deleted word (one word split into two by the correction).
                    text1_word_texts_old = []
                    while not text1_word_texts == text1_word_texts_old:
                        ndiff_result_list_copy = _words_ndiff(text1_word_texts,
                                                              text2_word_texts)
                        text1_word_texts_old = copy.copy(text1_word_texts)
                        ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                        remove_added = []
                        remove_deleted = []
                        for pos in reversed(ndiff_data['added_pos']):
                            ind = ndiff_data['added_pos'].index(pos)
                            if pos + 1 in ndiff_data['added_pos']:
                                joinedword = (ndiff_data['added'][ind]
                                              + ndiff_data['added'][ind + 1])
                                if joinedword in ndiff_data['deleted']:
                                    remove_deleted.append(
                                        ndiff_data['deleted_pos'][ndiff_data['deleted'].index(joinedword)])
                                    remove_added.append(pos)
                                    remove_added.append(pos + 1)
                                    break
                        if len(remove_deleted):
                            flags.append('kokku-lahku')
                            deleted_log = []
                            for ind in reversed(sorted(remove_deleted)):
                                deleted_log.insert(0, _pop_word(
                                    ind, text1_lemmas, text1_word_texts,
                                    text1_postags, text1_forms))
                            added_log = []
                            for ind in reversed(sorted(remove_added)):
                                added_log.insert(0, _pop_word(
                                    ind, text2_lemmas, text2_word_texts,
                                    text2_postags, text2_forms))
                            correction_sets.append({'type': 'kokku-lahku',
                                                    'added': added_log,
                                                    'deleted': deleted_log})
                            # NOTE(review): same single-pass sentinel as above.
                            text1_word_texts_old = copy.copy(text1_word_texts)

                    ndiff_result_list_copy = _words_ndiff(text1_word_texts,
                                                          text2_word_texts)
                    ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                    if not ndiff_data['changed']:
                        finished = 1

            ###############
            # suurväike (capitalisation) / paralleelvorm (parallel form)
            ###############
            if not finished:
                ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                if (text1_lemmas == text2_lemmas
                        and text1_forms == text2_forms
                        and ' '.join(text1_word_texts).lower()
                        == ' '.join(text2_word_texts).lower()):
                    # Same analysis, surface differs only in casing.
                    flags.append('suurväike')
                    correction_sets.append({'type': 'suurväike',
                                            'added': ndiff_data['added'],
                                            'deleted': ndiff_data['deleted']})
                    finished = 1
                elif (text1_lemmas == text2_lemmas
                      and text1_forms == text2_forms
                      and not text1_word_texts == text2_word_texts):
                    # Same lemma+form but a different (parallel) surface form.
                    flags.append('paralleelvorm')
                    correction_sets.append({'type': 'paralleelvorm',
                                            'added': ndiff_data['added'],
                                            'deleted': ndiff_data['deleted']})
                    finished = 1
                else:
                    # Same-position substitutions only.
                    resolved_pos = []
                    casediff = 0
                    worddiff = 0
                    for token_pos in intersection(ndiff_data['added_pos'],
                                                  ndiff_data['deleted_pos']):
                        if (text1_lemmas[token_pos] == text2_lemmas[token_pos]
                                and text1_forms[token_pos] == text2_forms[token_pos]):
                            if (text1_word_texts[token_pos].lower()
                                    == text2_word_texts[token_pos].lower()):
                                casediff += 1
                            else:
                                worddiff += 1
                            resolved_pos.append(token_pos)
                    deleted_log = []
                    added_log = []
                    for ind in reversed(sorted(resolved_pos)):
                        deleted_log.insert(0, _pop_word(
                            ind, text1_lemmas, text1_word_texts,
                            text1_postags, text1_forms))
                        added_log.insert(0, _pop_word(
                            ind, text2_lemmas, text2_word_texts,
                            text2_postags, text2_forms))
                    if len(resolved_pos):
                        if casediff:
                            flags.append('suurväike')
                            correction_sets.append({'type': 'suurväike',
                                                    'added': added_log,
                                                    'deleted': deleted_log})
                        if worddiff:
                            flags.append('paralleelvorm')
                            correction_sets.append({'type': 'paralleelvorm',
                                                    'added': added_log,
                                                    'deleted': deleted_log})
                    ndiff_result_list_copy = _words_ndiff(text1_word_texts,
                                                          text2_word_texts)
                    ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                    if not ndiff_data['changed']:
                        finished = 1

            ###################
            # valevorm (same lemma, wrong form)
            ###################
            if not finished:
                ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                if (text1_lemmas == text2_lemmas
                        and not text1_word_texts == text2_word_texts):
                    flags.append('valevorm')
                    correction_sets.append({'type': 'valevorm',
                                            'added': ndiff_data['added'],
                                            'deleted': ndiff_data['deleted']})
                    finished = 1
                else:
                    # Same-position substitutions sharing at least one lemma
                    # reading ('|' separates ambiguous analyses).
                    resolved_pos = []
                    for token_pos in ndiff_data['pos_intersection']:
                        if len(intersection(
                                text1_lemmas[token_pos].split('|'),
                                text2_lemmas[token_pos].split('|'))):
                            resolved_pos.append(token_pos)
                    deleted_log = []
                    added_log = []
                    for ind in reversed(sorted(resolved_pos)):
                        deleted_log.insert(0, _pop_word(
                            ind, text1_lemmas, text1_word_texts,
                            text1_postags, text1_forms))
                        added_log.insert(0, _pop_word(
                            ind, text2_lemmas, text2_word_texts,
                            text2_postags, text2_forms))
                    if len(resolved_pos):
                        flags.append('valevorm')
                        correction_sets.append({'type': 'valevorm',
                                                'added': added_log,
                                                'deleted': deleted_log})
                    ndiff_result_list_copy = _words_ndiff(text1_word_texts,
                                                          text2_word_texts)
                    ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                    if not ndiff_data['changed']:
                        finished = 1

            ###################
            # valelemma (same form, wrong lemma)
            ###################
            if not finished:
                ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                if (text1_forms == text2_forms
                        and not text1_lemmas == text2_lemmas):
                    flags.append('valelemma')
                    correction_sets.append({'type': 'valelemma',
                                            'added': ndiff_data['added'],
                                            'deleted': ndiff_data['deleted']})
                    finished = 1
                else:
                    resolved_pos = []
                    sub_type = ''
                    for token_pos in ndiff_data['pos_intersection']:
                        sub_type = ''
                        if (text1_forms[token_pos] == text2_forms[token_pos]
                                and not text1_lemmas[token_pos] == text2_lemmas[token_pos]):
                            resolved_pos.append(token_pos)
                        elif (intersection(text1_forms[token_pos].split('|'),
                                           text2_forms[token_pos].split('|'))
                              and not text1_lemmas[token_pos] == text2_lemmas[token_pos]):
                            sub_type = '2'
                            resolved_pos.append(token_pos)
                        elif (len(intersection(text1_forms[token_pos].split('|'),
                                               text2_forms[token_pos].split('|')))
                              and not len(intersection(
                                  text1_lemmas[token_pos].split('|'),
                                  text2_lemmas[token_pos].split('|')))):
                            sub_type = '3'
                            resolved_pos.append(token_pos)
                    deleted_log = []
                    added_log = []
                    for ind in reversed(sorted(resolved_pos)):
                        deleted_log.insert(0, _pop_word(
                            ind, text1_lemmas, text1_word_texts,
                            text1_postags, text1_forms))
                        added_log.insert(0, _pop_word(
                            ind, text2_lemmas, text2_word_texts,
                            text2_postags, text2_forms))
                    if len(resolved_pos):
                        flags.append('valelemma')
                        # NOTE(review): sub_type reflects only the LAST token
                        # examined, not each resolved one — kept as original.
                        correction_sets.append({'type': 'valelemma' + sub_type,
                                                'added': added_log,
                                                'deleted': deleted_log})
                    ndiff_result_list_copy = _words_ndiff(text1_word_texts,
                                                          text2_word_texts)
                    ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
                    if not ndiff_data['changed']:
                        finished = 1

            # Anything still unexplained gets the '0' (unresolved) flag plus
            # an HTML rendering of the remaining diff.
            if not finished:
                flags.append('0')
                remained = list(difflib.unified_diff(
                    "\n".join(text1_word_texts).splitlines(True),
                    "\n".join(text2_word_texts).splitlines(True)))
                corrections[uid][i]['remained_diff'] = _diff_to_html(remained)
            if finished:
                corrections[uid][i]['remained_diff'] = ''

            # '00': unresolved, but only same-position substitutions remain.
            ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
            if (len(flags) == 1 and flags[0] == '0'
                    and ndiff_data['added_pos'] == ndiff_data['deleted_pos']):
                flags = ['00']

            corrections[uid][i]['correction_sets'] = correction_sets
            corrections[uid][i]['flags'] = flags
            corrections[uid][i]['flags_label'] = "_".join(sorted(flags))
            if corrections[uid][i]['flags_label'] in ('0', '00'):
                corrections[uid][i]['remained_diff'] = ''

            # NOTE(review): row markup reconstructed — original was stripped.
            rows_html = ''
            rows_html += '<tr id="%s">\n' % (correction['id'],)
            rows_html += ('<td>%s</td><td>%s</td><td>%s</td><td>%s</td>\n'
                          % (uid, originals[uid]['text'],
                             originals[uid]['tokenized'],
                             '<br>'.join(correction['html_diff_result'])))
            rows_html += ('<td>%s</td><td>%s</td><td>%s</td>\n'
                          % (correction['id'], correction['text'],
                             correction['tokenized']))
            rows_html += '</tr>\n'

            kkey = "_".join(sorted(flags))
            if kkey not in collected_flags:
                collected_flags[kkey] = 0
            collected_flags[kkey] += 1
            if '0' in flags and kkey == '0':
                stats['lahendamata1'] += 1   # completely unresolved
            elif '0' in flags:
                stats['lahendamata2'] += 1   # partially unresolved
            file_out.write(rows_html)

    file_out.write(_HTML_FOOT)
    file_out.close()

    # One report page per collected flag combination.
    for label in collected_flags:
        print(label)
        file_out = open('tulemus/%s.html' % label, 'w')
        file_out.write(_HTML_HEAD)
        file_out.write('<h2>%s (%d)</h2>\n' % (label, collected_flags[label]))
        file_out.write('<p>Kontrolliti %d parandust</p>\n' % (stats['total'],))
        file_out.write('<p>Viga tuvastatud: %d</p>\n'
                       % (stats['total'] - stats['lahendamata1']
                          - stats['lahendamata2'],))
        file_out.write('<p>Viga tuvastamata: %d</p>\n'
                       % (stats['lahendamata1'],))
        file_out.write('<p>Viga osaliselt tuvastamata: %d</p>\n'
                       % (stats['lahendamata2'],))
        for label2 in sorted(collected_flags):
            file_out.write('<a href="%s.html">%s</a> (%d)<br>\n'
                           % (label2, label2, collected_flags[label2]))
        file_out.write('<table border="1">\n')
        for uid in sorted(corrections.keys()):
            for (i, correction) in enumerate(corrections[uid]):
                if 'flags' not in correction:
                    continue
                if not len(correction['flags']):
                    continue
                if not label == correction['flags_label']:
                    continue
                rows_html = ''
                rows_html += '<tr id="%s_%s">\n' % (label, correction['id'])
                rows_html += ('<td>%s</td><td>%s</td><td>%s</td><td>%s</td>\n'
                              % (uid, originals[uid]['text'],
                                 originals[uid]['tokenized'],
                                 '<br>'.join(correction['html_diff_result'])
                                 + '<hr>'
                                 + '<br>'.join(correction['remained_diff'])))
                rows_html += ('<td>%s</td><td>%s</td><td>%s</td>\n'
                              % (correction['id'], correction['text'],
                                 correction['tokenized']))
                for corr_set in corrections[uid][i]['correction_sets']:
                    rows_html += ('<td>%s: %s ---&gt; %s</td>\n'
                                  % (corr_set['type'],
                                     ' '.join(corr_set['deleted']),
                                     ' '.join(corr_set['added'])))
                rows_html += '</tr>\n'
                file_out.write(rows_html)
        file_out.write(_HTML_FOOT)
        file_out.close()


if __name__ == '__main__':
    main()