Commit 2c23061e authored by rabauti's avatar rabauti
Browse files

liigitamise logi-info parendatus

parent 6ce2d4b8
......@@ -151,15 +151,17 @@ linenr = 0
for uid in sorted(corrections.keys()):
linenr +=1
if linenr > 100: continue
#if linenr > 1000: continue
to_print = 0
correction_sets = []
for (i, correction) in enumerate(corrections[uid]):
correction_sets = []
stats['total'] += 1
flags = []
......@@ -299,6 +301,7 @@ for uid in sorted(corrections.keys()):
#print (ndiff_data['deleted'][0])
ndiff_data['deleted'][0] = ndiff_data['deleted'][0].lower()
#print (ndiff_data['deleted'][0])
if (sorted(ndiff_data['added']) == sorted(ndiff_data['deleted'])):
flags.append('sõnajärg')
correction_sets.append( {'type':'sõnajärg', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
......@@ -315,21 +318,26 @@ for uid in sorted(corrections.keys()):
#kustutame ainult esimese esinemise
deleted_elements_ind.append(ndiff_data['deleted_pos'][ndiff_data['deleted'].index(token)])
added_elements_ind.append(ndiff_data['added_pos'][ndiff_data['added'].index(token)])
correction_sets.append( {'type':'sõnajärg', 'added': intersection(ndiff_data['added'], ndiff_data['deleted']), 'deleted': intersection(ndiff_data['added'], ndiff_data['deleted']) })
deleted_log = []
for ind in reversed(sorted(deleted_elements_ind)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
deleted_log.insert(0, text1_word_texts.pop(ind))
text1_postags.pop(ind)
text1_forms.pop(ind)
added_log = []
for ind in reversed(sorted(added_elements_ind)):
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind)
text2_forms.pop(ind)
correction_sets.append( {'type':'sõnajärg', 'added': added_log, 'deleted': deleted_log })
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
#print ("+++++++")
#print ("".join(ndiff_result_list_copy))
ndiff_result_list_copy = list(new_ndiff_result)
......@@ -343,20 +351,28 @@ for uid in sorted(corrections.keys()):
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not len(ndiff_data['added']):
flags.append('sõnaüle')
correction_sets.append( {'type':'sõnaüle', 'added': [], 'deleted': ndiff_data['deleted'] })
finished =1
elif not len(ndiff_data['deleted']):
flags.append('sõnapuudu')
correction_sets.append( {'type':'sõnapuudu', 'added': ndiff_data['added'], 'deleted': [] })
finished =1
##################
# kokku-lahku
##################
if not finished:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if "".join(text1_word_texts) == "".join(text2_word_texts):
flags.append('kokku-lahku')
correction_sets.append( {'type':'kokku-lahku', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished =1
else:
#otsin lisatud sõnade seas sõnu, mis on järjest positsioonidel ja mille kokkuliitmisel saab mõne kustutatud sõna
#otsin kustutatud sõnade seas sõnu, mis on järjest positsioonidel ja mille kokkuliitmisel saab mõne lisatud sõna
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
#added_pos = [text1_postags[index] for index in ndiff_data['deleted_pos']]
#deleted_pos = [text2_postags[index] for index in ndiff_data['added_pos']]
......@@ -392,17 +408,24 @@ for uid in sorted(corrections.keys()):
if len(remove_deleted):
flags.append('kokku-lahku')
deleted_log = []
for ind in reversed(sorted(remove_deleted)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
deleted_log.insert(0, text1_word_texts.pop(ind))
text1_postags.pop(ind)
text1_forms.pop(ind)
added_log = []
for ind in reversed(sorted(remove_added)):
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind)
text2_forms.pop(ind)
correction_sets.append( {'type':'kokku-lahku', 'added': added_log, 'deleted': deleted_log })
text1_word_texts_old = copy.copy(text1_word_texts)
......@@ -433,18 +456,19 @@ for uid in sorted(corrections.keys()):
if len(remove_deleted):
flags.append('kokku-lahku')
deleted_log = []
for ind in reversed(sorted(remove_deleted)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
deleted_log.insert(0, text1_word_texts.pop(ind))
text1_postags.pop(ind)
text1_forms.pop(ind)
added_log = []
for ind in reversed(sorted(remove_added)):
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind)
text2_forms.pop(ind)
correction_sets.append( {'type':'kokku-lahku', 'added': added_log, 'deleted': deleted_log })
text1_word_texts_old = copy.copy(text1_word_texts)
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
......@@ -453,19 +477,25 @@ for uid in sorted(corrections.keys()):
if not ndiff_data['changed']:
finished = 1
###############
# suurväike
# paralleelvorm
###############
if not finished:
#sama sõna muu vorm
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if (text1_lemmas == text2_lemmas and text1_forms == text2_forms and ' '.join(text1_word_texts).lower()==' '.join(text2_word_texts).lower() ):
flags.append('suurväike')
correction_sets.append( {'type':'suurväike', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished =1
elif (text1_lemmas == text2_lemmas and text1_forms == text2_forms and not text1_word_texts==text2_word_texts ):
flags.append('paralleelvorm')
correction_sets.append( {'type':'paralleelvorm', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished =1
else:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
resolved_pos = []
casediff = 0
......@@ -478,77 +508,97 @@ for uid in sorted(corrections.keys()):
worddiff +=1
resolved_pos.append(token_pos)
if len (resolved_pos):
if casediff:
flags.append('suurväike')
if worddiff:
flags.append('paralleelvorm')
deleted_log = []
added_log = []
for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
deleted_log.insert(0, text1_word_texts.pop(ind))
text1_postags.pop(ind)
text1_forms.pop(ind)
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind)
text2_forms.pop(ind)
if len (resolved_pos):
if casediff:
flags.append('suurväike')
correction_sets.append( {'type':'suurväike', 'added': added_log, 'deleted':deleted_log})
if worddiff:
flags.append('paralleelvorm')
correction_sets.append( {'type':'paralleelvorm', 'added': added_log, 'deleted': deleted_log})
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not ndiff_data['changed']:
finished = 1
###################
# valevorm
####################
if not finished:
#sama sõna muu vorm
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
#vaatas kogu teksti
if (text1_lemmas == text2_lemmas and not text1_word_texts==text2_word_texts ):
flags.append('valevorm')
correction_sets.append( {'type':'valevorm', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished =1
#vaatame samal positsioonil asuvaid asendusi
else:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
resolved_pos = []
for token_pos in intersection(ndiff_data['added_pos'], ndiff_data['deleted_pos']):
for token_pos in ndiff_data['pos_intersection']:
if len(intersection(text1_lemmas[token_pos].split('|'), text2_lemmas[token_pos].split('|'))):
# if text1_lemmas[token_pos] == text2_lemmas[token_pos]:
resolved_pos.append(token_pos)
if len (resolved_pos):
flags.append('valevorm')
deleted_log = []
added_log = []
for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
deleted_log.insert(0, text1_word_texts.pop(ind))
text1_postags.pop(ind)
text1_forms.pop(ind)
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind)
text2_forms.pop(ind)
if len (resolved_pos):
flags.append('valevorm')
correction_sets.append( {'type':'valevorm', 'added': added_log, 'deleted': deleted_log})
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not ndiff_data['changed']:
finished = 1
###################
# valelemma
####################
if not finished:
#sama vorm muu sõna
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if (text1_forms == text2_forms and not text1_lemmas==text2_lemmas ):
flags.append('valelemma')
correction_sets.append( {'type':'valelemma', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
finished =1
#vaatame samal positsioonil asuvaid asendusi
else:
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
resolved_pos = []
for token_pos in intersection(ndiff_data['added_pos'], ndiff_data['deleted_pos']):
for token_pos in ndiff_data['pos_intersection']:
sub_type = ''
if text1_forms[token_pos] == text2_forms[token_pos] and not text1_lemmas[token_pos]==text2_lemmas[token_pos]:
resolved_pos.append(token_pos)
......@@ -560,20 +610,23 @@ for uid in sorted(corrections.keys()):
resolved_pos.append(token_pos)
if len (resolved_pos):
flags.append('valelemma'+sub_type)
deleted_log = []
added_log = []
for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
deleted_log.insert(0, text1_word_texts.pop(ind))
text1_postags.pop(ind)
text1_forms.pop(ind)
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
added_log.insert(0, text2_word_texts.pop(ind))
text2_postags.pop(ind)
text2_forms.pop(ind)
if len (resolved_pos):
flags.append('valelemma')
correction_sets.append( {'type':'valelemma'+sub_type, 'added': added_log, 'deleted': deleted_log})
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
......@@ -581,38 +634,7 @@ for uid in sorted(corrections.keys()):
finished = 1
if not finished:
#nud<->nu tud<->tu nudtud
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
resolved_pos = []
for token_pos in intersection(ndiff_data['added_pos'], ndiff_data['deleted_pos']):
if ( 'nud' in text1_forms[token_pos].split('|') or 'tud' in text1_forms[token_pos].split('|') ) and len(intersection(text1_lemmas[token_pos].split('|'), text2_lemmas[token_pos].split('|')) ):
resolved_pos.append(token_pos)
if len (resolved_pos):
#print('here')
flags.append('nudtud')
for ind in reversed(sorted(resolved_pos)):
text1_lemmas.pop(ind)
text1_word_texts.pop(ind)
text1_postags.pop(ind)
text1_forms.pop(ind)
text2_lemmas.pop(ind)
text2_word_texts.pop(ind)
text2_postags.pop(ind)
text2_forms.pop(ind)
new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
ndiff_result_list_copy = list(new_ndiff_result)
ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
if not ndiff_data['changed']:
finished = 1
if not finished:
flags.append('0')
......@@ -741,7 +763,7 @@ for label in collected_flags:
for corr_set in corrections[uid][i]['correction_sets']:
rows_html += '<tr><td>&nbsp;</td><td>%s</td><td><span style="color:red">%s</span> &lt;=== &gt; <span style="color:green">%s</td><td></td></tr>\n' % (corr_set['type'], ' '.join(corr_set['deleted']) , ' '.join(corr_set['added'] ) )
rows_html += '<tr><td>&nbsp;</td><td>%s</td><td><span style="color:red">%s</span> ---&gt; <span style="color:green">%s</td><td></td></tr>\n' % (corr_set['type'], ' '.join(corr_set['deleted']) , ' '.join(corr_set['added'] ) )
rows_html += '<tr><td colspan="4">&nbsp;</td></tr>\n'
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment