Commit 63cee48b authored by rabauti
Browse files

parandused

parent 807ca65a
...@@ -7,6 +7,7 @@ import sys ...@@ -7,6 +7,7 @@ import sys
import re import re
import difflib import difflib
import copy import copy
from datetime import datetime
from estnltk import Text from estnltk import Text
from difflib import Differ from difflib import Differ
...@@ -113,9 +114,11 @@ d2 = difflib.HtmlDiff( tabsize=4, wrapcolumn=40 ) ...@@ -113,9 +114,11 @@ d2 = difflib.HtmlDiff( tabsize=4, wrapcolumn=40 )
stats = {} stats = {}
stats['total'] = 0 stats['total'] = 0
stats['lahendatud'] = 0
stats['lahendamata1'] = 0 stats['lahendamata1'] = 0
stats['lahendamata2'] = 0 stats['lahendamata2'] = 0
filename2 = 'tulemus/tundmatu.html' filename2 = 'tulemus/tundmatu.html'
file_out = open(filename2, 'w') file_out = open(filename2, 'w')
...@@ -142,7 +145,7 @@ linenr = 0 ...@@ -142,7 +145,7 @@ linenr = 0
for uid in sorted(corrections.keys()): for uid in sorted(corrections.keys()):
linenr +=1 linenr +=1
if linenr > 100: continue #if linenr > 100: continue
to_print = 0 to_print = 0
...@@ -159,7 +162,12 @@ for uid in sorted(corrections.keys()): ...@@ -159,7 +162,12 @@ for uid in sorted(corrections.keys()):
text2 = Text(correction['text']) text2 = Text(correction['text'])
#text2_words = text2.word_texts #text2_words = text2.word_texts
corrections[uid][i]['tokenized'] = ' '.join(text2.word_texts) corrections[uid][i]['tokenized'] = ' '.join(text2.word_texts)
ndiff_result = difflib.ndiff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1)) ndiff_result = difflib.ndiff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1))
ndiff_result_list = list(ndiff_result) ndiff_result_list = list(ndiff_result)
unified_diff_result = difflib.unified_diff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1)) unified_diff_result = difflib.unified_diff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1))
...@@ -206,7 +214,25 @@ for uid in sorted(corrections.keys()): ...@@ -206,7 +214,25 @@ for uid in sorted(corrections.keys()):
text2_forms = text2_copy.forms text2_forms = text2_copy.forms
#lisame lõppu nn saba, muidu viimase sõna diff võib valeks minna
text1_lemmas.append('####')
text2_lemmas.append('####')
text1_word_texts.append('####')
text2_word_texts.append('####')
text1_postags.append('####')
text2_postags.append('####')
text1_forms.append('####')
text2_forms.append('####')
ndiff_result = difflib.ndiff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1))
ndiff_result_list = list(ndiff_result)
ndiff_result_list_copy = copy.copy(ndiff_result_list) ndiff_result_list_copy = copy.copy(ndiff_result_list)
added = [] added = []
...@@ -216,7 +242,7 @@ for uid in sorted(corrections.keys()): ...@@ -216,7 +242,7 @@ for uid in sorted(corrections.keys()):
deleted_pos = [] deleted_pos = []
#print (ndiff_result_list)
# esiteks lihtne kontroll, et tokeniseerimata originaal ja tokeniseerimata parandus ei erine omavahel # esiteks lihtne kontroll, et tokeniseerimata originaal ja tokeniseerimata parandus ei erine omavahel
if originals[uid]['text'] == correction['text']: if originals[uid]['text'] == correction['text']:
...@@ -268,7 +294,7 @@ for uid in sorted(corrections.keys()): ...@@ -268,7 +294,7 @@ for uid in sorted(corrections.keys()):
text2_postags.pop(ind) text2_postags.pop(ind)
text2_forms.pop(ind) text2_forms.pop(ind)
correction_sets.append( {'type':'punktuatsioon', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] }) correction_sets.append( {'type':'punktuatsioon', 'added': added_log, 'deleted': deleted_log })
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1)) new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
#print ("+++++++") #print ("+++++++")
#print ("".join(ndiff_result_list_copy)) #print ("".join(ndiff_result_list_copy))
...@@ -277,6 +303,9 @@ for uid in sorted(corrections.keys()): ...@@ -277,6 +303,9 @@ for uid in sorted(corrections.keys()):
finished = 1 finished = 1
#print ("".join(ndiff_result_list_copy)) #print ("".join(ndiff_result_list_copy))
###########################
# sõnaasukoht
###########################
if not finished: if not finished:
#sõnade järjekorra kontrollimine #sõnade järjekorra kontrollimine
# kui sõna on lause alguses, siis võrdleme seda lowercase # kui sõna on lause alguses, siis võrdleme seda lowercase
...@@ -294,13 +323,13 @@ for uid in sorted(corrections.keys()): ...@@ -294,13 +323,13 @@ for uid in sorted(corrections.keys()):
#print (ndiff_data['deleted'][0]) #print (ndiff_data['deleted'][0])
if (sorted(ndiff_data['added']) == sorted(ndiff_data['deleted'])): if (sorted(ndiff_data['added']) == sorted(ndiff_data['deleted'])):
flags.append('sõnajärg') flags.append('sõnaasukoht')
correction_sets.append( {'type':'sõnajärg', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] }) correction_sets.append( {'type':'sõnaasukoht', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished = 1 finished = 1
elif intersection(ndiff_data['added'], ndiff_data['deleted']): elif intersection(ndiff_data['added'], ndiff_data['deleted']):
flags.append('sõnajärg') flags.append('sõnaasukoht')
deleted_elements_ind = [] deleted_elements_ind = []
added_elements_ind = [] added_elements_ind = []
...@@ -326,7 +355,7 @@ for uid in sorted(corrections.keys()): ...@@ -326,7 +355,7 @@ for uid in sorted(corrections.keys()):
text2_postags.pop(ind) text2_postags.pop(ind)
text2_forms.pop(ind) text2_forms.pop(ind)
correction_sets.append( {'type':'sõnajärg', 'added': added_log, 'deleted': deleted_log }) correction_sets.append( {'type':'sõnaasukoht', 'added': added_log, 'deleted': deleted_log })
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1)) new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
#print ("+++++++") #print ("+++++++")
...@@ -491,7 +520,7 @@ for uid in sorted(corrections.keys()): ...@@ -491,7 +520,7 @@ for uid in sorted(corrections.keys()):
casediff = 0 casediff = 0
worddiff = 0 worddiff = 0
for token_pos in intersection(ndiff_data['added_pos'], ndiff_data['deleted_pos']): for token_pos in ndiff_data['pos_intersection']:
if text1_lemmas[token_pos] == text2_lemmas[token_pos] and text1_forms[token_pos] == text2_forms[token_pos]: if text1_lemmas[token_pos] == text2_lemmas[token_pos] and text1_forms[token_pos] == text2_forms[token_pos]:
if text1_word_texts[token_pos].lower() == text2_word_texts[token_pos].lower(): if text1_word_texts[token_pos].lower() == text2_word_texts[token_pos].lower():
casediff +=1 casediff +=1
...@@ -499,25 +528,37 @@ for uid in sorted(corrections.keys()): ...@@ -499,25 +528,37 @@ for uid in sorted(corrections.keys()):
worddiff +=1 worddiff +=1
resolved_pos.append(token_pos) resolved_pos.append(token_pos)
deleted_log = [] deleted_log_worddiff = []
added_log = [] added_log_worddiff = []
deleted_log_casediff = []
added_log_casediff = []
for ind in reversed(sorted(resolved_pos)): for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind) text1_lemmas.pop(ind)
deleted_log.insert(0, text1_word_texts.pop(ind)) if text1_word_texts[ind].lower() == text2_word_texts[ind].lower():
deleted_log_casediff.insert(0, text1_word_texts.pop(ind))
added_log_casediff.insert(0, text2_word_texts.pop(ind))
else:
deleted_log_worddiff.insert(0, text1_word_texts.pop(ind))
added_log_worddiff.insert(0, text2_word_texts.pop(ind))
text1_postags.pop(ind) text1_postags.pop(ind)
text1_forms.pop(ind) text1_forms.pop(ind)
text2_lemmas.pop(ind) text2_lemmas.pop(ind)
added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind) text2_postags.pop(ind)
text2_forms.pop(ind) text2_forms.pop(ind)
if len (resolved_pos): if len (resolved_pos):
if casediff: if len(added_log_casediff):
flags.append('suurväike') flags.append('suurväike')
correction_sets.append( {'type':'suurväike', 'added': added_log, 'deleted':deleted_log}) correction_sets.append( {'type':'suurväike', 'added': added_log_casediff, 'deleted':deleted_log_casediff})
if worddiff: #print (ndiff_result_list_copy)
#print (added_log_casediff)
if len(deleted_log_worddiff):
flags.append('paralleelvorm') flags.append('paralleelvorm')
correction_sets.append( {'type':'paralleelvorm', 'added': added_log, 'deleted': deleted_log}) correction_sets.append( {'type':'paralleelvorm', 'added': added_log_worddiff, 'deleted': deleted_log_worddiff})
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1)) new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result) ndiff_result_list_copy = list(new_ndiff_result)
...@@ -625,12 +666,24 @@ for uid in sorted(corrections.keys()): ...@@ -625,12 +666,24 @@ for uid in sorted(corrections.keys()):
finished = 1 finished = 1
if not finished:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not len(ndiff_data['added']):
flags.append('sõnaüle')
correction_sets.append( {'type':'sõnaüle', 'added': [], 'deleted': ndiff_data['deleted'] })
finished =1
elif not len(ndiff_data['deleted']):
flags.append('sõnapuudu')
correction_sets.append( {'type':'sõnapuudu', 'added': ndiff_data['added'], 'deleted': [] })
finished =1
if not finished: if not finished:
flags.append('0') flags.append('0')
unified_remained_diff_result = difflib.unified_diff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1)) unified_remained_diff_result = difflib.unified_diff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
html_diff_result = [] html_diff_result = []
for line in unified_remained_diff_result: for line in unified_remained_diff_result:
if line.startswith('-'): if line.startswith('-'):
...@@ -641,13 +694,15 @@ for uid in sorted(corrections.keys()): ...@@ -641,13 +694,15 @@ for uid in sorted(corrections.keys()):
html_diff_result.append('<span style="color:gray">%s</span>' % line) html_diff_result.append('<span style="color:gray">%s</span>' % line)
else: else:
html_diff_result.append('%s' % line) html_diff_result.append('%s' % line)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
correction_sets.append( {'type':'lahendamata', 'added':ndiff_data['added'], 'deleted': ndiff_data['deleted']})
corrections[uid][i]['remained_diff'] = html_diff_result corrections[uid][i]['remained_diff'] = html_diff_result
if finished : if finished :
corrections[uid][i]['remained_diff'] = '' corrections[uid][i]['remained_diff'] = ''
stats['lahendatud'] += 1
# tokeniseertud originaal ja tokeniseerimata originaal ei erine omavahel # tokeniseertud originaal ja tokeniseerimata originaal ei erine omavahel
...@@ -689,8 +744,8 @@ for uid in sorted(corrections.keys()): ...@@ -689,8 +744,8 @@ for uid in sorted(corrections.keys()):
if '0' in flags and kkey in ('0', '00'): if '0' in flags and kkey in ('0', '00'):
stats['lahendamata1'] +=1 stats['lahendamata1'] +=1
elif '0' in flags: elif '0' in flags:
stats['lahendamata1'] +=1
stats['lahendamata2'] +=1 stats['lahendamata2'] +=1
file_out.write('</table>') file_out.write('</table>')
file_out.write ('</body></html>') file_out.write ('</body></html>')
file_out.close() file_out.close()
...@@ -727,12 +782,13 @@ for label in collected_flags: ...@@ -727,12 +782,13 @@ for label in collected_flags:
<body> <body>
""") """)
file_out.write( '<h5>%s</h5>' % str(datetime.now()))
file_out.write( '<h1>%s (%d)</h1>' % (label,collected_flags[label])) file_out.write( '<h1>%s (%d)</h1>' % (label,collected_flags[label]))
file_out.write( '<h3>Kontrolliti %d parandust</h3>' % (stats['total'])) file_out.write( '<h3>Kontrolliti %d parandust</h3>' % (stats['total']))
file_out.write( '<h3>Viga tuvastatud: %d</h3>' % (stats['total']- stats['lahendamata1']-stats['lahendamata2'])) file_out.write( '<h3>Viga tuvastatud: %d</h3>' % (stats['lahendatud']))
file_out.write( '<h3>Viga tuvastamata: %d (osaliselt tuvastatud %d)</h3>' % (stats['lahendamata1'] + stats['lahendamata2'] , stats['lahendamata2'])) file_out.write( '<h3>Viga tuvastamata: %d (osaliselt tuvastatud %d)</h3>' % (stats['lahendamata1'] + stats['lahendamata2'] , stats['lahendamata2']))
file_out.write( '<h4>Kirjeldus <a target="_new" href="%s.html">#:</a></h4>') file_out.write( '<h4>Kirjeldus <a target="_new" href="kirjeldus.html">#:</a></h4>')
for label2 in sorted(collected_flags): for label2 in sorted(collected_flags):
file_out.write( '<span>%s <a href="%s.html">(%d)</a></span><br/>' % (label2,label2,collected_flags[label2])) file_out.write( '<span>%s <a href="%s.html">(%d)</a></span><br/>' % (label2,label2,collected_flags[label2]))
...@@ -772,10 +828,22 @@ for label in collected_flags: ...@@ -772,10 +828,22 @@ for label in collected_flags:
#paranduste klassifitseerimine #paranduste klassifitseerimine
errorDecriptions = { errorDecriptions = {
'sõnadejärg' : { '0' : {
'order':'1',
'rows': [
[ '0', 'Tuvastamata.'],
[ '00', 'Tuvastamata. Parandused ja eksimused on lausetes kohakuti. ']
] },
'sõnaasukoht' : {
'order':'1', 'order':'1',
'rows': [ 'rows': [
[ 'sõnadejärg', 'Sõna asukoht lauses on muutunud. Sõnakuju (va. väike-suurtäht lause alguses) pole muutunud. Sõnade morfanalüüsi ei vaadata.'] [ 'sõnaasukoht', 'Sõna asukoht lauses on muutunud. Sõnakuju (va. väike-suurtäht lause alguses) pole muutunud. Sõnade morfanalüüsi ei vaadata.']
] },
'suur-väike' : {
'order':'1',
'rows': [
[ 'suur-väike', 'Sõnas muutusid suurtähed v väiketähed.']
] }, ] },
'punktuatsioon' : { 'punktuatsioon' : {
...@@ -787,7 +855,7 @@ errorDecriptions = { ...@@ -787,7 +855,7 @@ errorDecriptions = {
'kokku-lahku' : { 'kokku-lahku' : {
'order':'3', 'order':'3',
'rows': [ 'rows': [
[ 'kokku-lahku', 'Lauses on muudetud (lisatud, eemaldatud, asendatud) mõnda kirjavahemärki.'] [ 'kokku-lahku', 'Parandatud on sõna kokku-lahku kirjutamist.']
]}, ]},
'valelemma' : { 'valelemma' : {
...@@ -813,7 +881,7 @@ errorDecriptions = { ...@@ -813,7 +881,7 @@ errorDecriptions = {
'sõnaüle' : { 'sõnaüle' : {
'order':'5', 'order':'5',
'rows': [ 'rows': [
[ 'sõnaüle', 'Lauses oli sõna üle.'] [ 'sõnaüle', 'Lauses oli üleliigne sõna.']
]}, ]},
} }
...@@ -823,10 +891,12 @@ filename2 = 'tulemus/kirjeldus.html' ...@@ -823,10 +891,12 @@ filename2 = 'tulemus/kirjeldus.html'
file_out = open(filename2, 'w') file_out = open(filename2, 'w')
file_out.write( '<html> <meta charset="UTF-8"><head></head>') file_out.write( '<html> <meta charset="UTF-8"><head></head>')
file_out.write( "<body>") file_out.write( "<body>")
file_out.write( '<h5>%s</h5>' % str(datetime.now()))
file_out.write( '<h1>Tüübid</h1>') file_out.write( '<h1>Tüübid</h1>')
for key in sorted(errorDecriptions.keys()): for key in sorted(errorDecriptions.keys()):
#file_out.write( '<h2>%s</h2>' % key)
for row in errorDecriptions[key]['rows']: for row in errorDecriptions[key]['rows']:
file_out.write( '<p><b>%s</b><p><p style="padding-left:20px">%s</p>' % (row[0] , row[1])) file_out.write( '<p><b>%s</b><p><p style="padding-left:20px">%s</p>' % (row[0] , row[1]))
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment