Commit 63cee48b authored by rabauti

parandused ("fixes")

parent 807ca65a
@@ -7,9 +7,10 @@ import sys
import re
import difflib
import copy
+from datetime import datetime
from estnltk import Text
from difflib import Differ
# try to detect some of the simpler error types
# source file: ../korpus_tsv
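
The core technique of the script, as a minimal standalone sketch (it assumes the estnltk 1.x Text API that the script itself imports; the sentence pair is hypothetical, the real data comes from ../korpus_tsv):

import difflib
from estnltk import Text

original = Text('Ma läheb homme kooli.')
corrected = Text('Ma lähen homme kooli.')

# diff the tokenized word streams, one token per line
result = difflib.ndiff(
    "\n".join(original.word_texts).splitlines(1),
    "\n".join(corrected.word_texts).splitlines(1))
print("".join(result))
# tokens only in the original are prefixed '- ',
# tokens only in the correction are prefixed '+ '
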
@@ -113,9 +114,11 @@ d2 = difflib.HtmlDiff( tabsize=4, wrapcolumn=40 )
stats = {}
stats['total'] = 0
stats['lahendatud'] = 0
+stats['lahendamata1'] = 0
+stats['lahendamata2'] = 0
filename2 = 'tulemus/tundmatu.html'
file_out = open(filename2, 'w')
@@ -142,7 +145,7 @@ linenr = 0
for uid in sorted(corrections.keys()):
linenr +=1
-if linenr > 100: continue
+#if linenr > 100: continue
to_print = 0
@@ -154,12 +157,17 @@ for uid in sorted(corrections.keys()):
flags = []
text1 = Text(originals[uid]['text'])
originals[uid]['tokenized'] = ' '.join(text1.word_texts)
text2 = Text(correction['text'])
#text2_words = text2.word_texts
corrections[uid][i]['tokenized'] = ' '.join(text2.word_texts)
ndiff_result = difflib.ndiff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1))
ndiff_result_list = list(ndiff_result)
unified_diff_result = difflib.unified_diff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1))
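
Both diff flavours above operate on the same keepends-split token lines; a small self-contained illustration of what each produces (the token lists are hypothetical):

import difflib

tokens1 = ['Ma', 'läheb', 'homme', 'kooli', '.']   # original, tokenized
tokens2 = ['Ma', 'lähen', 'homme', 'kooli', '.']   # correction, tokenized

# ndiff: verbose, one line per token, with '- '/'+ '/'? ' markers
print(''.join(difflib.ndiff(
    "\n".join(tokens1).splitlines(1),
    "\n".join(tokens2).splitlines(1))))

# unified_diff: compact '---/+++/@@' form, the shape the HTML report is built from
print(''.join(difflib.unified_diff(
    "\n".join(tokens1).splitlines(1),
    "\n".join(tokens2).splitlines(1))))
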
@@ -206,7 +214,25 @@ for uid in sorted(corrections.keys()):
text2_forms = text2_copy.forms
+# append a sentinel "tail" token to the end; otherwise the diff for the last word can come out wrong
+text1_lemmas.append('####')
+text2_lemmas.append('####')
+text1_word_texts.append('####')
+text2_word_texts.append('####')
+text1_postags.append('####')
+text2_postags.append('####')
+text1_forms.append('####')
+text2_forms.append('####')
ndiff_result = difflib.ndiff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1))
ndiff_result_list = list(ndiff_result)
ndiff_result_list_copy = copy.copy(ndiff_result_list)
added = []
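
Why the '####' sentinel: appending one identical dummy token to both token lists guarantees the diff never ends on a changed line, which is what the translated comment above warns about for the last word. A self-contained sketch with hypothetical tokens:

import difflib

t1 = ['Ta', 'tuli', 'koju'] + ['####']    # original + sentinel
t2 = ['Ta', 'tuli', 'koju.'] + ['####']   # correction + sentinel

print(''.join(difflib.ndiff(
    "\n".join(t1).splitlines(1),
    "\n".join(t2).splitlines(1))))
# the changed last word appears as a '- koju' / '+ koju.' replacement
# (plus a '? ' hint line), followed by the matching '####' line
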
@@ -216,7 +242,7 @@ for uid in sorted(corrections.keys()):
deleted_pos = []
#print (ndiff_result_list)
# first, a simple check that the untokenized original and the untokenized correction do not differ from each other
if originals[uid]['text'] == correction['text']:
@@ -268,7 +294,7 @@ for uid in sorted(corrections.keys()):
text2_postags.pop(ind)
text2_forms.pop(ind)
-correction_sets.append( {'type':'punktuatsioon', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
+correction_sets.append( {'type':'punktuatsioon', 'added': added_log, 'deleted': deleted_log })
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
#print ("+++++++")
#print ("".join(ndiff_result_list_copy))
@@ -276,7 +302,10 @@ for uid in sorted(corrections.keys()):
if not len(ndiff_result_list_copy):
finished = 1
#print ("".join(ndiff_result_list_copy))
+###########################
+# sõnaasukoht (word position)
+###########################
if not finished:
# check the word order
# if the word is at the start of the sentence, compare it lowercased
@@ -294,13 +323,13 @@ for uid in sorted(corrections.keys()):
#print (ndiff_data['deleted'][0])
if (sorted(ndiff_data['added']) == sorted(ndiff_data['deleted'])):
-flags.append('sõnajärg')
-correction_sets.append( {'type':'sõnajärg', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
+flags.append('sõnaasukoht')
+correction_sets.append( {'type':'sõnaasukoht', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished = 1
elif intersection(ndiff_data['added'], ndiff_data['deleted']):
-flags.append('sõnajärg')
+flags.append('sõnaasukoht')
deleted_elements_ind = []
added_elements_ind = []
@@ -326,7 +355,7 @@ for uid in sorted(corrections.keys()):
text2_postags.pop(ind)
text2_forms.pop(ind)
-correction_sets.append( {'type':'sõnajärg', 'added': added_log, 'deleted': deleted_log })
+correction_sets.append( {'type':'sõnaasukoht', 'added': added_log, 'deleted': deleted_log })
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
#print ("+++++++")
@@ -491,7 +520,7 @@ for uid in sorted(corrections.keys()):
casediff = 0
worddiff = 0
-for token_pos in intersection(ndiff_data['added_pos'], ndiff_data['deleted_pos']):
+for token_pos in ndiff_data['pos_intersection']:
if text1_lemmas[token_pos] == text2_lemmas[token_pos] and text1_forms[token_pos] == text2_forms[token_pos]:
if text1_word_texts[token_pos].lower() == text2_word_texts[token_pos].lower():
casediff +=1
@@ -499,25 +528,37 @@ for uid in sorted(corrections.keys()):
worddiff +=1
resolved_pos.append(token_pos)
-deleted_log = []
-added_log = []
+deleted_log_worddiff = []
+added_log_worddiff = []
+deleted_log_casediff = []
+added_log_casediff = []
for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind)
-deleted_log.insert(0, text1_word_texts.pop(ind))
+if text1_word_texts[ind].lower() == text2_word_texts[ind].lower():
+deleted_log_casediff.insert(0, text1_word_texts.pop(ind))
+added_log_casediff.insert(0, text2_word_texts.pop(ind))
+else:
+deleted_log_worddiff.insert(0, text1_word_texts.pop(ind))
+added_log_worddiff.insert(0, text2_word_texts.pop(ind))
text1_postags.pop(ind)
text1_forms.pop(ind)
text2_lemmas.pop(ind)
-added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind)
text2_forms.pop(ind)
if len (resolved_pos):
-if casediff:
+if len(added_log_casediff):
flags.append('suurväike')
-correction_sets.append( {'type':'suurväike', 'added': added_log, 'deleted':deleted_log})
-if worddiff:
+correction_sets.append( {'type':'suurväike', 'added': added_log_casediff, 'deleted':deleted_log_casediff})
+#print (ndiff_result_list_copy)
+#print (added_log_casediff)
+if len(deleted_log_worddiff):
flags.append('paralleelvorm')
-correction_sets.append( {'type':'paralleelvorm', 'added': added_log, 'deleted': deleted_log})
+correction_sets.append( {'type':'paralleelvorm', 'added': added_log_worddiff, 'deleted': deleted_log_worddiff})
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result)
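
The split above distinguishes two cases for tokens whose lemma and morphological form agree at the same position: if the surface forms match case-insensitively it is a capitalization fix ('suurväike'), otherwise an accepted parallel form ('paralleelvorm'). A minimal sketch with hypothetical word pairs:

pairs = [('tallinn', 'Tallinn'),   # case-only difference  -> suurväike
         ('joonud', 'joond')]      # parallel surface form -> paralleelvorm
for w1, w2 in pairs:
    if w1.lower() == w2.lower():
        print(w1, '->', w2, ': suurväike')
    else:
        print(w1, '->', w2, ': paralleelvorm')
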
@@ -624,13 +665,25 @@ for uid in sorted(corrections.keys()):
if not ndiff_data['changed']:
finished = 1
+if not finished:
+ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
+if not len(ndiff_data['added']):
+flags.append('sõnaüle')
+correction_sets.append( {'type':'sõnaüle', 'added': [], 'deleted': ndiff_data['deleted'] })
+finished =1
+elif not len(ndiff_data['deleted']):
+flags.append('sõnapuudu')
+correction_sets.append( {'type':'sõnapuudu', 'added': ndiff_data['added'], 'deleted': [] })
+finished =1
if not finished:
flags.append('0')
unified_remained_diff_result = difflib.unified_diff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
html_diff_result = []
for line in unified_remained_diff_result:
if line.startswith('-'):
@@ -641,13 +694,15 @@ for uid in sorted(corrections.keys()):
html_diff_result.append('<span style="color:gray">%s</span>' % line)
else:
html_diff_result.append('%s' % line)
+ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
+correction_sets.append( {'type':'lahendamata', 'added':ndiff_data['added'], 'deleted': ndiff_data['deleted']})
corrections[uid][i]['remained_diff'] = html_diff_result
if finished :
corrections[uid][i]['remained_diff'] = ''
stats['lahendatud'] += 1
# the tokenized original and the untokenized original do not differ from each other
@@ -689,8 +744,8 @@ for uid in sorted(corrections.keys()):
if '0' in flags and kkey in ('0', '00'):
stats['lahendamata1'] +=1
elif '0' in flags:
-stats['lahendamata1'] +=1
+stats['lahendamata2'] +=1
file_out.write('</table>')
file_out.write ('</body></html>')
file_out.close()
@@ -727,12 +782,13 @@ for label in collected_flags:
<body>
""")
+file_out.write( '<h5>%s</h5>' % str(datetime.now()))
file_out.write( '<h1>%s (%d)</h1>' % (label,collected_flags[label]))
file_out.write( '<h3>Kontrolliti %d parandust</h3>' % (stats['total']))
-file_out.write( '<h3>Viga tuvastatud: %d</h3>' % (stats['total']- stats['lahendamata1']-stats['lahendamata2']))
+file_out.write( '<h3>Viga tuvastatud: %d</h3>' % (stats['lahendatud']))
file_out.write( '<h3>Viga tuvastamata: %d (osaliselt tuvastatud %d)</h3>' % (stats['lahendamata1'] + stats['lahendamata2'] , stats['lahendamata2']))
-file_out.write( '<h4>Kirjeldus <a target="_new" href="%s.html">#:</a></h4>')
+file_out.write( '<h4>Kirjeldus <a target="_new" href="kirjeldus.html">#:</a></h4>')
for label2 in sorted(collected_flags):
file_out.write( '<span>%s <a href="%s.html">(%d)</a></span><br/>' % (label2,label2,collected_flags[label2]))
@@ -762,7 +818,7 @@ for label in collected_flags:
file_out.write(rows_html)
@@ -772,10 +828,22 @@ for label in collected_flags:
# classifying the corrections
errorDecriptions = {
-'sõnadejärg' : {
+'0' : {
+'order':'1',
+'rows': [
+[ '0', 'Tuvastamata.'],
+[ '00', 'Tuvastamata. Parandused ja eksimused on lausetes kohakuti. ']
+] },
+'sõnaasukoht' : {
'order':'1',
'rows': [
-[ 'sõnadejärg', 'Sõna asukoht lauses on muutunud. Sõnakuju (va. väike-suurtäht lause alguses) pole muutunud. Sõnade morfanalüüsi ei vaadata.']
+[ 'sõnaasukoht', 'Sõna asukoht lauses on muutunud. Sõnakuju (va. väike-suurtäht lause alguses) pole muutunud. Sõnade morfanalüüsi ei vaadata.']
] },
+'suur-väike' : {
+'order':'1',
+'rows': [
+[ 'suur-väike', 'Sõnas muutusid suurtähed v väiketähed.']
+] },
'punktuatsioon' : {
@@ -787,7 +855,7 @@ errorDecriptions = {
'kokku-lahku' : {
'order':'3',
'rows': [
-[ 'kokku-lahku', 'Lauses on muudetud (lisatud, eemaldatud, asendatud) mõnda kirjavahemärki.']
+[ 'kokku-lahku', 'Parandatud on sõna kokku-lahku kirjutamist.']
]},
'valelemma' : {
@@ -813,7 +881,7 @@ errorDecriptions = {
'sõnaüle' : {
'order':'5',
'rows': [
-[ 'sõnaüle', 'Lauses oli sõna üle.']
+[ 'sõnaüle', 'Lauses oli üleliigne sõna.']
]},
}
@@ -823,10 +891,12 @@ filename2 = 'tulemus/kirjeldus.html'
file_out = open(filename2, 'w')
file_out.write( '<html> <meta charset="UTF-8"><head></head>')
file_out.write( "<body>")
+file_out.write( '<h5>%s</h5>' % str(datetime.now()))
file_out.write( '<h1>Tüübid</h1>')
for key in sorted(errorDecriptions.keys()):
#file_out.write( '<h2>%s</h2>' % key)
for row in errorDecriptions[key]['rows']:
file_out.write( '<p><b>%s</b><p><p style="padding-left:20px">%s</p>' % (row[0] , row[1]))