Commit 2c23061e authored by rabauti
Browse files

liigitamise logi-info parendatus

parent 6ce2d4b8
...@@ -151,15 +151,17 @@ linenr = 0 ...@@ -151,15 +151,17 @@ linenr = 0
for uid in sorted(corrections.keys()): for uid in sorted(corrections.keys()):
linenr +=1 linenr +=1
if linenr > 100: continue #if linenr > 1000: continue
to_print = 0 to_print = 0
correction_sets = []
for (i, correction) in enumerate(corrections[uid]): for (i, correction) in enumerate(corrections[uid]):
correction_sets = []
stats['total'] += 1 stats['total'] += 1
flags = [] flags = []
...@@ -299,6 +301,7 @@ for uid in sorted(corrections.keys()): ...@@ -299,6 +301,7 @@ for uid in sorted(corrections.keys()):
#print (ndiff_data['deleted'][0]) #print (ndiff_data['deleted'][0])
ndiff_data['deleted'][0] = ndiff_data['deleted'][0].lower() ndiff_data['deleted'][0] = ndiff_data['deleted'][0].lower()
#print (ndiff_data['deleted'][0]) #print (ndiff_data['deleted'][0])
if (sorted(ndiff_data['added']) == sorted(ndiff_data['deleted'])): if (sorted(ndiff_data['added']) == sorted(ndiff_data['deleted'])):
flags.append('sõnajärg') flags.append('sõnajärg')
correction_sets.append( {'type':'sõnajärg', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] }) correction_sets.append( {'type':'sõnajärg', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
...@@ -315,21 +318,26 @@ for uid in sorted(corrections.keys()): ...@@ -315,21 +318,26 @@ for uid in sorted(corrections.keys()):
#kustutame ainult esimese esinemise #kustutame ainult esimese esinemise
deleted_elements_ind.append(ndiff_data['deleted_pos'][ndiff_data['deleted'].index(token)]) deleted_elements_ind.append(ndiff_data['deleted_pos'][ndiff_data['deleted'].index(token)])
added_elements_ind.append(ndiff_data['added_pos'][ndiff_data['added'].index(token)]) added_elements_ind.append(ndiff_data['added_pos'][ndiff_data['added'].index(token)])
correction_sets.append( {'type':'sõnajärg', 'added': intersection(ndiff_data['added'], ndiff_data['deleted']), 'deleted': intersection(ndiff_data['added'], ndiff_data['deleted']) })
deleted_log = []
for ind in reversed(sorted(deleted_elements_ind)): for ind in reversed(sorted(deleted_elements_ind)):
text1_lemmas.pop(ind) text1_lemmas.pop(ind)
text1_word_texts.pop(ind) deleted_log.insert(0, text1_word_texts.pop(ind))
text1_postags.pop(ind) text1_postags.pop(ind)
text1_forms.pop(ind) text1_forms.pop(ind)
added_log = []
for ind in reversed(sorted(added_elements_ind)): for ind in reversed(sorted(added_elements_ind)):
text2_lemmas.pop(ind) text2_lemmas.pop(ind)
text2_word_texts.pop(ind) added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind) text2_postags.pop(ind)
text2_forms.pop(ind) text2_forms.pop(ind)
correction_sets.append( {'type':'sõnajärg', 'added': added_log, 'deleted': deleted_log })
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1)) new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
#print ("+++++++") #print ("+++++++")
#print ("".join(ndiff_result_list_copy)) #print ("".join(ndiff_result_list_copy))
ndiff_result_list_copy = list(new_ndiff_result) ndiff_result_list_copy = list(new_ndiff_result)
...@@ -343,20 +351,28 @@ for uid in sorted(corrections.keys()): ...@@ -343,20 +351,28 @@ for uid in sorted(corrections.keys()):
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy) ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not len(ndiff_data['added']): if not len(ndiff_data['added']):
flags.append('sõnaüle') flags.append('sõnaüle')
correction_sets.append( {'type':'sõnaüle', 'added': [], 'deleted': ndiff_data['deleted'] })
finished =1 finished =1
elif not len(ndiff_data['deleted']): elif not len(ndiff_data['deleted']):
flags.append('sõnapuudu') flags.append('sõnapuudu')
correction_sets.append( {'type':'sõnapuudu', 'added': ndiff_data['added'], 'deleted': [] })
finished =1 finished =1
##################
# kokku-lahku
##################
if not finished: if not finished:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if "".join(text1_word_texts) == "".join(text2_word_texts): if "".join(text1_word_texts) == "".join(text2_word_texts):
flags.append('kokku-lahku') flags.append('kokku-lahku')
correction_sets.append( {'type':'kokku-lahku', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished =1 finished =1
else: else:
#otsin lisatud sõnade seas sõnu, mis on järjest positsioonidel ja mille kokkuliitmisel saab mõne kustutatud sõna #otsin lisatud sõnade seas sõnu, mis on järjest positsioonidel ja mille kokkuliitmisel saab mõne kustutatud sõna
#otsin kustutatud sõnade seas sõnu, mis on järjest positsioonidel ja mille kokkuliitmisel saab mõne lisatud sõna #otsin kustutatud sõnade seas sõnu, mis on järjest positsioonidel ja mille kokkuliitmisel saab mõne lisatud sõna
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
#added_pos = [text1_postags[index] for index in ndiff_data['deleted_pos']] #added_pos = [text1_postags[index] for index in ndiff_data['deleted_pos']]
#deleted_pos = [text2_postags[index] for index in ndiff_data['added_pos']] #deleted_pos = [text2_postags[index] for index in ndiff_data['added_pos']]
...@@ -392,17 +408,24 @@ for uid in sorted(corrections.keys()): ...@@ -392,17 +408,24 @@ for uid in sorted(corrections.keys()):
if len(remove_deleted): if len(remove_deleted):
flags.append('kokku-lahku') flags.append('kokku-lahku')
deleted_log = []
for ind in reversed(sorted(remove_deleted)): for ind in reversed(sorted(remove_deleted)):
text1_lemmas.pop(ind) text1_lemmas.pop(ind)
text1_word_texts.pop(ind) deleted_log.insert(0, text1_word_texts.pop(ind))
text1_postags.pop(ind) text1_postags.pop(ind)
text1_forms.pop(ind) text1_forms.pop(ind)
added_log = []
for ind in reversed(sorted(remove_added)): for ind in reversed(sorted(remove_added)):
text2_lemmas.pop(ind) text2_lemmas.pop(ind)
text2_word_texts.pop(ind) added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind) text2_postags.pop(ind)
text2_forms.pop(ind) text2_forms.pop(ind)
correction_sets.append( {'type':'kokku-lahku', 'added': added_log, 'deleted': deleted_log })
text1_word_texts_old = copy.copy(text1_word_texts) text1_word_texts_old = copy.copy(text1_word_texts)
...@@ -433,18 +456,19 @@ for uid in sorted(corrections.keys()): ...@@ -433,18 +456,19 @@ for uid in sorted(corrections.keys()):
if len(remove_deleted): if len(remove_deleted):
flags.append('kokku-lahku') flags.append('kokku-lahku')
deleted_log = []
for ind in reversed(sorted(remove_deleted)): for ind in reversed(sorted(remove_deleted)):
text1_lemmas.pop(ind) text1_lemmas.pop(ind)
text1_word_texts.pop(ind) deleted_log.insert(0, text1_word_texts.pop(ind))
text1_postags.pop(ind) text1_postags.pop(ind)
text1_forms.pop(ind) text1_forms.pop(ind)
added_log = []
for ind in reversed(sorted(remove_added)): for ind in reversed(sorted(remove_added)):
text2_lemmas.pop(ind) text2_lemmas.pop(ind)
text2_word_texts.pop(ind) added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind) text2_postags.pop(ind)
text2_forms.pop(ind) text2_forms.pop(ind)
correction_sets.append( {'type':'kokku-lahku', 'added': added_log, 'deleted': deleted_log })
text1_word_texts_old = copy.copy(text1_word_texts) text1_word_texts_old = copy.copy(text1_word_texts)
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1)) new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
...@@ -453,19 +477,25 @@ for uid in sorted(corrections.keys()): ...@@ -453,19 +477,25 @@ for uid in sorted(corrections.keys()):
if not ndiff_data['changed']: if not ndiff_data['changed']:
finished = 1 finished = 1
###############
# suurväike
# paralleelvorm
###############
if not finished: if not finished:
#sama sõna muu vorm #sama sõna muu vorm
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if (text1_lemmas == text2_lemmas and text1_forms == text2_forms and ' '.join(text1_word_texts).lower()==' '.join(text2_word_texts).lower() ): if (text1_lemmas == text2_lemmas and text1_forms == text2_forms and ' '.join(text1_word_texts).lower()==' '.join(text2_word_texts).lower() ):
flags.append('suurväike') flags.append('suurväike')
correction_sets.append( {'type':'suurväike', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished =1 finished =1
elif (text1_lemmas == text2_lemmas and text1_forms == text2_forms and not text1_word_texts==text2_word_texts ): elif (text1_lemmas == text2_lemmas and text1_forms == text2_forms and not text1_word_texts==text2_word_texts ):
flags.append('paralleelvorm') flags.append('paralleelvorm')
correction_sets.append( {'type':'paralleelvorm', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished =1 finished =1
else: else:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
resolved_pos = [] resolved_pos = []
casediff = 0 casediff = 0
...@@ -478,77 +508,97 @@ for uid in sorted(corrections.keys()): ...@@ -478,77 +508,97 @@ for uid in sorted(corrections.keys()):
worddiff +=1 worddiff +=1
resolved_pos.append(token_pos) resolved_pos.append(token_pos)
if len (resolved_pos): deleted_log = []
if casediff: added_log = []
flags.append('suurväike')
if worddiff:
flags.append('paralleelvorm')
for ind in reversed(sorted(resolved_pos)): for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind) text1_lemmas.pop(ind)
text1_word_texts.pop(ind) deleted_log.insert(0, text1_word_texts.pop(ind))
text1_postags.pop(ind) text1_postags.pop(ind)
text1_forms.pop(ind) text1_forms.pop(ind)
text2_lemmas.pop(ind) text2_lemmas.pop(ind)
text2_word_texts.pop(ind) added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind) text2_postags.pop(ind)
text2_forms.pop(ind) text2_forms.pop(ind)
if len (resolved_pos):
if casediff:
flags.append('suurväike')
correction_sets.append( {'type':'suurväike', 'added': added_log, 'deleted':deleted_log})
if worddiff:
flags.append('paralleelvorm')
correction_sets.append( {'type':'paralleelvorm', 'added': added_log, 'deleted': deleted_log})
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1)) new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result) ndiff_result_list_copy = list(new_ndiff_result)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy) ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not ndiff_data['changed']: if not ndiff_data['changed']:
finished = 1 finished = 1
###################
# valevorm
####################
if not finished: if not finished:
#sama sõna muu vorm #sama sõna muu vorm
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
#vaatas kogu teksti #vaatas kogu teksti
if (text1_lemmas == text2_lemmas and not text1_word_texts==text2_word_texts ): if (text1_lemmas == text2_lemmas and not text1_word_texts==text2_word_texts ):
flags.append('valevorm') flags.append('valevorm')
correction_sets.append( {'type':'valevorm', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished =1 finished =1
#vaatame samal positsioonil asuvaid asendusi #vaatame samal positsioonil asuvaid asendusi
else: else:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
resolved_pos = [] resolved_pos = []
for token_pos in intersection(ndiff_data['added_pos'], ndiff_data['deleted_pos']): for token_pos in ndiff_data['pos_intersection']:
if len(intersection(text1_lemmas[token_pos].split('|'), text2_lemmas[token_pos].split('|'))): if len(intersection(text1_lemmas[token_pos].split('|'), text2_lemmas[token_pos].split('|'))):
# if text1_lemmas[token_pos] == text2_lemmas[token_pos]: # if text1_lemmas[token_pos] == text2_lemmas[token_pos]:
resolved_pos.append(token_pos) resolved_pos.append(token_pos)
if len (resolved_pos):
flags.append('valevorm')
deleted_log = []
added_log = []
for ind in reversed(sorted(resolved_pos)): for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind) text1_lemmas.pop(ind)
text1_word_texts.pop(ind) deleted_log.insert(0, text1_word_texts.pop(ind))
text1_postags.pop(ind) text1_postags.pop(ind)
text1_forms.pop(ind) text1_forms.pop(ind)
text2_lemmas.pop(ind) text2_lemmas.pop(ind)
text2_word_texts.pop(ind) added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind) text2_postags.pop(ind)
text2_forms.pop(ind) text2_forms.pop(ind)
if len (resolved_pos):
flags.append('valevorm')
correction_sets.append( {'type':'valevorm', 'added': added_log, 'deleted': deleted_log})
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1)) new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result) ndiff_result_list_copy = list(new_ndiff_result)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy) ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not ndiff_data['changed']: if not ndiff_data['changed']:
finished = 1 finished = 1
###################
# valelemma
####################
if not finished: if not finished:
#sama vorm muu sõna #sama vorm muu sõna
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if (text1_forms == text2_forms and not text1_lemmas==text2_lemmas ): if (text1_forms == text2_forms and not text1_lemmas==text2_lemmas ):
flags.append('valelemma') flags.append('valelemma')
correction_sets.append( {'type':'valelemma', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished =1 finished =1
#vaatame samal positsioonil asuvaid asendusi #vaatame samal positsioonil asuvaid asendusi
else: else:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
resolved_pos = [] resolved_pos = []
for token_pos in intersection(ndiff_data['added_pos'], ndiff_data['deleted_pos']): for token_pos in ndiff_data['pos_intersection']:
sub_type = '' sub_type = ''
if text1_forms[token_pos] == text2_forms[token_pos] and not text1_lemmas[token_pos]==text2_lemmas[token_pos]: if text1_forms[token_pos] == text2_forms[token_pos] and not text1_lemmas[token_pos]==text2_lemmas[token_pos]:
resolved_pos.append(token_pos) resolved_pos.append(token_pos)
...@@ -560,20 +610,23 @@ for uid in sorted(corrections.keys()): ...@@ -560,20 +610,23 @@ for uid in sorted(corrections.keys()):
resolved_pos.append(token_pos) resolved_pos.append(token_pos)
if len (resolved_pos):
flags.append('valelemma'+sub_type)
deleted_log = []
added_log = []
for ind in reversed(sorted(resolved_pos)): for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind) text1_lemmas.pop(ind)
text1_word_texts.pop(ind) deleted_log.insert(0, text1_word_texts.pop(ind))
text1_postags.pop(ind) text1_postags.pop(ind)
text1_forms.pop(ind) text1_forms.pop(ind)
text2_lemmas.pop(ind) text2_lemmas.pop(ind)
text2_word_texts.pop(ind) added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind) text2_postags.pop(ind)
text2_forms.pop(ind) text2_forms.pop(ind)
if len (resolved_pos):
flags.append('valelemma')
correction_sets.append( {'type':'valelemma'+sub_type, 'added': added_log, 'deleted': deleted_log})
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1)) new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result) ndiff_result_list_copy = list(new_ndiff_result)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy) ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
...@@ -581,38 +634,7 @@ for uid in sorted(corrections.keys()): ...@@ -581,38 +634,7 @@ for uid in sorted(corrections.keys()):
finished = 1 finished = 1
if not finished:
#nud<->nu tud<->tu nudtud
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
resolved_pos = []
for token_pos in intersection(ndiff_data['added_pos'], ndiff_data['deleted_pos']):
if ( 'nud' in text1_forms[token_pos].split('|') or 'tud' in text1_forms[token_pos].split('|') ) and len(intersection(text1_lemmas[token_pos].split('|'), text2_lemmas[token_pos].split('|')) ):
resolved_pos.append(token_pos)
if len (resolved_pos):
#print('here')
flags.append('nudtud')
for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
text1_postags.pop(ind)
text1_forms.pop(ind)
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
text2_postags.pop(ind)
text2_forms.pop(ind)
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not ndiff_data['changed']:
finished = 1
if not finished: if not finished:
flags.append('0') flags.append('0')
...@@ -741,7 +763,7 @@ for label in collected_flags: ...@@ -741,7 +763,7 @@ for label in collected_flags:
for corr_set in corrections[uid][i]['correction_sets']: for corr_set in corrections[uid][i]['correction_sets']:
rows_html += '<tr><td>&nbsp;</td><td>%s</td><td><span style="color:red">%s</span> &lt;=== &gt; <span style="color:green">%s</td><td></td></tr>\n' % (corr_set['type'], ' '.join(corr_set['deleted']) , ' '.join(corr_set['added'] ) ) rows_html += '<tr><td>&nbsp;</td><td>%s</td><td><span style="color:red">%s</span> ---&gt; <span style="color:green">%s</td><td></td></tr>\n' % (corr_set['type'], ' '.join(corr_set['deleted']) , ' '.join(corr_set['added'] ) )
rows_html += '<tr><td colspan="4">&nbsp;</td></tr>\n' rows_html += '<tr><td colspan="4">&nbsp;</td></tr>\n'
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment