#!/usr/bin/python
# -*- coding: utf-8 -*-


import os
import sys
import re
import difflib
import copy

from estnltk import Text
from difflib import Differ

# try to detect some of the simpler error types
# source data: ../korpus_tsv

# read all sentences in and build dicts

# originals[sentenceID]         = {'id': ..., 'text': ...}
# corrections[sentenceID][0..n] = {'id': ..., 'text': ...}



originals 	= {} 
corrections 	= {} 

script_name = (os.path.realpath(__file__))
script_dir = os.path.dirname(script_name)

def intersection(lst1, lst2):
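	"""Return the elements common to both lists (set-based, so duplicates are dropped and order is not guaranteed)."""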
	if not lst1 or not len(lst1): return []
	if not lst2 or not len(lst2): return []
	return list(set(lst1) & set(lst2))


def get_data_from_ndiff(ndiff_list):
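	"""
	Parse the output of difflib.ndiff over two newline-joined token lists.

	Returns the added and deleted tokens, their 0-based positions in the new
	and old token list respectively, a 'changed' flag and the sorted
	intersection of the two position lists.

	A small sketch: for ['Ta', 'läks', 'kooli'] vs ['Ta', 'läheb', 'kooli']
	the result is {'added': ['läheb'], 'added_pos': [1],
	'deleted': ['läks'], 'deleted_pos': [1],
	'changed': True, 'pos_intersection': [1]}.
	"""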
	added = []
	deleted = []
	added_pos = []
	deleted_pos = []
	line_nr_new = -1
	line_nr_old = -1
	for line in ndiff_list:
		token = line.rstrip()[2:]

		if line.startswith('-'):
			line_nr_old += 1
			deleted.append(token)
			deleted_pos.append(line_nr_old)
			continue
				
		if line.startswith('+'):
			line_nr_new += 1
			added.append(token)
			added_pos.append(line_nr_new)
			continue
		
		if line.startswith('?'): continue
		
		line_nr_old += 1
		line_nr_new += 1
	changed = True
	if not len(added_pos) and not len(deleted_pos):
		changed = False
	
		
	return ({'added':added , 'added_pos':added_pos, 'deleted':deleted, 'deleted_pos':deleted_pos, 'changed':changed, 'pos_intersection':sorted(intersection(added_pos,deleted_pos ))})


filename = script_dir +'/../korpus_tsv/korpus.tsv'
#outdir = script_dir +'/korpus_xml'

file_input = open(filename, 'r')



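# each input line is expected to be "<uid>\t<sentence>"; the last
# underscore-separated part of the uid starts with 'a' for an original
# sentence and with 'p' for one of its corrections
# (hypothetical example: "doc1_s5_a1<TAB>Ma lähen kooli.")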
for line in file_input:
	
	line = line.rstrip()
	line_arr = line.split('\t')
	
	if len(line_arr) != 2:
		print('ERROR:', line)
		sys.exit(1)
	
	uid = line_arr[0]
	text = line_arr[1]
	
	uid_arr = uid.split('_')
	uid_ending = uid_arr.pop()
	uid2 = '_'.join(uid_arr)
	el_type = uid_ending[0]
	
	if el_type == 'a':
		originals[uid2] = {'id': uid, 'text':text.strip()}
	
	if el_type == 'p':
		if not uid2 in corrections:
			corrections[uid2] = []
		corrections[uid2].append({'id': uid, 'text':text.strip()})

		
		

collected_flags = {}




# start with the corrections
d = Differ()
d2 = difflib.HtmlDiff( tabsize=4, wrapcolumn=40 )

stats = {}
stats['total'] = 0
stats['lahendamata1'] = 0
stats['lahendamata2'] = 0
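# 'lahendamata1' counts every correction that ends up with the unresolved flag '0';
# 'lahendamata2' counts those of them where at least part of the change was explained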

filename2 = 'tulemus/tundmatu.html'
file_out = open(filename2, 'w')

file_out.write( """<html><head><meta charset="UTF-8"></head>
<style>
table   {
	margin-top: 50px;
	width: 1000px;
	border-collapse: collapse;
	}
table, th, td 
{
	border: 1px solid black;
	vertical-align:top;
	padding: 5px;
}
</style>
<body>
""") 

file_out.write( '<h1>%s</h1>' % ('Tuvastamata veaga'))
file_out.write( '<table style="padding:5px">')
linenr = 0
for uid in sorted(corrections.keys()):
	
	linenr +=1
	if linenr > 100: continue  # only the first 100 sentences are processed
	to_print = 0


	for (i, correction) in enumerate(corrections[uid]):
		
		correction_sets = []
		
		stats['total'] += 1
		flags = []
		
		text1 = Text(originals[uid]['text'])
		originals[uid]['tokenized'] = ' '.join(text1.word_texts)

		text2 = Text(correction['text'])
		#text2_words = text2.word_texts
		
		corrections[uid][i]['tokenized'] = ' '.join(text2.word_texts)
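		# the diffs are taken over one token per line, so difflib compares the
		# sentences word by word rather than character by character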
		ndiff_result = difflib.ndiff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1))
		ndiff_result_list = list(ndiff_result)
		# materialise the diff: it is both iterated below and stored afterwards
		unified_diff_result = list(difflib.unified_diff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1)))

		html_diff_result = []
		for line in unified_diff_result:
			if line.startswith('-'):
				html_diff_result.append('<span style="color:red">%s</span>' % line)
			elif line.startswith('+'):
				html_diff_result.append('<span style="color:green">%s</span>' % line)
			elif line.startswith('?'):
				html_diff_result.append('<span style="color:gray">%s</span>' % line)
			else:
				html_diff_result.append('%s' % line)
		corrections[uid][i]['unified_diff_result'] = list(unified_diff_result)		
		corrections[uid][i]['html_diff_result'] = html_diff_result
		
		
		# the initial arrays needed for the work:
		#   text1, text2
		#   ndiff_result, unified_diff_result
		# to keep the originals intact, we continue working on copies
		
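		# from here on the classification works as a cascade: each check looks at the
		# remaining token diff, records a flag plus a correction set, removes the
		# tokens it has explained from the working copies and recomputes the diff;
		# 'finished' is set as soon as no differences remain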
		finished = 0
		text1_copy  = Text(originals[uid]['text'])
		text2_copy= Text(correction['text'])
		
		text1_lemmas = text1_copy.lemmas
		text2_lemmas = text2_copy.lemmas
		
		
		text1_word_texts = text1_copy.word_texts
		text2_word_texts = text2_copy.word_texts
		
		text1_postags = text1_copy.postags
		text2_postags = text2_copy.postags
	
		text1_forms = text1_copy.forms
		text2_forms = text2_copy.forms
	
	
		
		ndiff_result_list_copy = copy.copy(ndiff_result_list)
		
		added = []
		deleted = []
		
		added_pos = []
		deleted_pos = []
		
		
		
		
		# first a simple check that the untokenized original and the untokenized correction do not differ from each other
		if originals[uid]['text'] == correction['text']:
			flags.append('puudub')
			finished = 1
		if not finished and originals[uid]['tokenized'] == correction['tokenized']:
			flags.append('tühik')
			finished = 1
		
		# check whether anything was done with the punctuation marks
		if not finished:
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
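			# NB: despite their names, the four lists below hold POS tags, not positions:
			# tags of the tokens removed from the original (text1) and of the tokens
			# added in the correction (text2); they are only used for the 'Z'
			# (punctuation) checks that follow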
			added_pos = [text1_postags[index] for index in ndiff_data['deleted_pos']]
			deleted_pos = [text2_postags[index] for index in ndiff_data['added_pos']]
			
			added_pos_uniq = list(set([text1_postags[index] for index in ndiff_data['deleted_pos']]))
			deleted_pos_uniq = list(set([text2_postags[index] for index in ndiff_data['added_pos']]))
			
			#print ('added_pos_uniq', added_pos_uniq)
			#print ('deleted_pos_uniq', deleted_pos_uniq)
			rowsets = {}
			# the changes involve only punctuation marks
			if ''.join(added_pos_uniq) in ['Z', ''] and ''.join(deleted_pos_uniq) in ['Z', '']:
				flags.append('punktuatsioon')
				correction_sets.append( {'type':'punktuatsioon', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
				
				
				finished = 1
			# some of the changes involve punctuation marks
			elif 'Z' in added_pos + deleted_pos:
				flags.append('punktuatsioon')
				
				# as an intermediate step, remove all punctuation marks so that we can compare again
				deleted_log = []
				for (ind, pos) in reversed(list( enumerate(text1_postags))):
					
					if pos == 'Z':
						text1_lemmas.pop(ind)
						deleted_log.insert(0, text1_word_texts.pop(ind))
						text1_postags.pop(ind)
						text1_forms.pop(ind)
						
				removed = []
				added_log = []
				for (ind, pos) in reversed(list( enumerate(text2_postags))):
					if pos == 'Z':
						text2_lemmas.pop(ind)
						added_log.insert(0, text2_word_texts.pop(ind))
						text2_postags.pop(ind)
						text2_forms.pop(ind)
				
				correction_sets.append( {'type':'punktuatsioon', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
				new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
				#print ("+++++++")
				#print ("".join(ndiff_result_list_copy))
				ndiff_result_list_copy = list(new_ndiff_result)
				if not len(ndiff_result_list_copy):
					finished = 1
				#print ("".join(ndiff_result_list_copy))
				
		if not finished:
			# check the word order
			# if a word is at the start of the sentence, compare it in lowercase
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
			
			# lowercase the first words
			if 0 in ndiff_data['added_pos']:
				#print (ndiff_data['added'][0])
				ndiff_data['added'][0] = ndiff_data['added'][0].lower()
				#print (ndiff_data['added'][0])
				
			if 0 in ndiff_data['deleted_pos']:
				#print (ndiff_data['deleted'][0])
				ndiff_data['deleted'][0] = ndiff_data['deleted'][0].lower()
				#print (ndiff_data['deleted'][0])
			
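			# if the added and deleted tokens are the same multiset, only the word
			# order changed; if they merely overlap, the shared words are treated as
			# reordered and removed, and the rest is left for the later checks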
			if (sorted(ndiff_data['added']) == sorted(ndiff_data['deleted'])):
				flags.append('sõnajärg')
				correction_sets.append( {'type':'sõnajärg', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
				
				finished = 1
				
			elif intersection(ndiff_data['added'], ndiff_data['deleted']):
				flags.append('sõnajärg')
				
				deleted_elements_ind = []
				added_elements_ind = []
				# remove the words that changed position
				for token in intersection(ndiff_data['added'], ndiff_data['deleted']):
					# remove only the first occurrence
					deleted_elements_ind.append(ndiff_data['deleted_pos'][ndiff_data['deleted'].index(token)])
					added_elements_ind.append(ndiff_data['added_pos'][ndiff_data['added'].index(token)])
				
				
					
				deleted_log = []
				for ind in reversed(sorted(deleted_elements_ind)):
					text1_lemmas.pop(ind)
					deleted_log.insert(0, text1_word_texts.pop(ind))
					text1_postags.pop(ind)
					text1_forms.pop(ind)

				added_log = []
				for ind in reversed(sorted(added_elements_ind)):
					text2_lemmas.pop(ind)
					added_log.insert(0, text2_word_texts.pop(ind))
					text2_postags.pop(ind)
					text2_forms.pop(ind)
			
				correction_sets.append( {'type':'sõnajärg', 'added':  added_log, 'deleted': deleted_log })
				new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
			
				#print ("+++++++")
				#print ("".join(ndiff_result_list_copy))
				ndiff_result_list_copy = list(new_ndiff_result)
				ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
				if not ndiff_data['changed']:
					finished = 1
				#print ("".join(ndiff_result_list_copy))
			
			
		if not finished:
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
			if not len(ndiff_data['added']):
				flags.append('sõnaüle')
				correction_sets.append( {'type':'sõnaüle', 'added':  [], 'deleted': ndiff_data['deleted'] })
				finished =1
			elif not len(ndiff_data['deleted']):
				flags.append('sõnapuudu')
				correction_sets.append( {'type':'sõnapuudu', 'added':  ndiff_data['added'], 'deleted': [] })
				finished =1
		
		
		##################
		#	kokku-lahku 
		##################
		if not finished:
			
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
			if "".join(text1_word_texts) == "".join(text2_word_texts):
				flags.append('kokku-lahku')
				correction_sets.append( {'type':'kokku-lahku',  'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
				finished =1
			else:
				# among the added words, look for words at consecutive positions which, joined together, give one of the deleted words
				# among the deleted words, look for words at consecutive positions which, joined together, give one of the added words
				
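				# hypothetical example: original tokens ['selle', 'pärast'] vs corrected
				# token ['sellepärast'] -> the two deleted tokens joined give an added
				# token, so the change is classified as 'kokku-lahku'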
				#added_pos = [text1_postags[index] for index in ndiff_data['deleted_pos']]
				#deleted_pos = [text2_postags[index] for index in ndiff_data['added_pos']]
				
				joined = {}
				
				
				text1_word_texts_old  = []
				
				# words that were joined together in the correction
				while not text1_word_texts == text1_word_texts_old:
					
					new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
					ndiff_result_list_copy = list(new_ndiff_result)
					text1_word_texts_old = copy.copy(text1_word_texts)
					ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
					
					
					remove_added = []
					remove_deleted = []
					
					for pos in reversed(ndiff_data['deleted_pos']):
						ind = ndiff_data['deleted_pos'].index(pos)
						if pos+1 in ndiff_data['deleted_pos']:

							joinedword = ndiff_data['deleted'][ind] + ndiff_data['deleted'][ind+1]
							if joinedword in ndiff_data['added']:
								#print (ndiff_data['deleted'], ndiff_data['added'])
								remove_added.append(ndiff_data['added_pos'][ndiff_data['added'].index(joinedword)])
								remove_deleted.append(pos)
								remove_deleted.append(pos+1)
								#print (remove_deleted, remove_added)
							break
					
					if len(remove_deleted):
						flags.append('kokku-lahku')
						
						deleted_log = []
						for ind in reversed(sorted(remove_deleted)):
							
							text1_lemmas.pop(ind)
							deleted_log.insert(0, text1_word_texts.pop(ind))
							text1_postags.pop(ind)
							text1_forms.pop(ind)
							
						added_log = []
						for ind in reversed(sorted(remove_added)):	
							text2_lemmas.pop(ind)
							added_log.insert(0, text2_word_texts.pop(ind))
							text2_postags.pop(ind)
							text2_forms.pop(ind)
						correction_sets.append( {'type':'kokku-lahku', 'added': added_log, 'deleted': deleted_log })		
						
							
							
					text1_word_texts_old = copy.copy(text1_word_texts)
				
				text1_word_texts_old  = []
				# words that were split apart in the correction
				while not text1_word_texts == text1_word_texts_old:
					
					new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
					ndiff_result_list_copy = list(new_ndiff_result)
					text1_word_texts_old = copy.copy(text1_word_texts)
					ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
					
					
					remove_added = []
					remove_deleted = []
					for pos in reversed(ndiff_data['added_pos']):
						ind = ndiff_data['added_pos'].index(pos)
						if pos+1 in ndiff_data['added_pos']:

							joinedword = ndiff_data['added'][ind] + ndiff_data['added'][ind+1]
							if joinedword in ndiff_data['deleted']:
								#print (ndiff_data['added'], ndiff_data['deleted'])
								remove_deleted.append(ndiff_data['deleted_pos'][ndiff_data['deleted'].index(joinedword)])
								remove_added.append(pos)
								remove_added.append(pos+1)
								#print (remove_added, remove_deleted)
							break
					
					if len(remove_deleted):
						flags.append('kokku-lahku')
						deleted_log = []
						for ind in reversed(sorted(remove_deleted)):
							text1_lemmas.pop(ind)
							deleted_log.insert(0, text1_word_texts.pop(ind))
							text1_postags.pop(ind)
							text1_forms.pop(ind)
						added_log = []
						for ind in reversed(sorted(remove_added)):	
							text2_lemmas.pop(ind)
							added_log.insert(0, text2_word_texts.pop(ind))
							text2_postags.pop(ind)
							text2_forms.pop(ind)
						correction_sets.append( {'type':'kokku-lahku', 'added': added_log, 'deleted': deleted_log })
					text1_word_texts_old = copy.copy(text1_word_texts)
					
				new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
				ndiff_result_list_copy = list(new_ndiff_result)
				ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
				if not ndiff_data['changed']:
					finished = 1
		
		###############
		#	suurväike
		#	paralleelvorm
		###############
		if not finished:
			# another form of the same word
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
			
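			# same lemma and same morphological form: if the words differ only in
			# letter case the change is 'suurväike' (capitalisation), otherwise it is
			# a parallel spelling of the same form, 'paralleelvorm'; the else branch
			# applies the same test token by token at matching positions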
			if (text1_lemmas == text2_lemmas and text1_forms == text2_forms and ' '.join(text1_word_texts).lower()==' '.join(text2_word_texts).lower() ):
				flags.append('suurväike')
				correction_sets.append( {'type':'suurväike',  'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
				finished =1
			elif (text1_lemmas == text2_lemmas and text1_forms == text2_forms and not text1_word_texts==text2_word_texts ):
				flags.append('paralleelvorm')
				correction_sets.append( {'type':'paralleelvorm',  'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
				finished =1
			
			else:
				
				resolved_pos = []
				
				casediff = 0
				worddiff = 0
				for token_pos in intersection(ndiff_data['added_pos'], ndiff_data['deleted_pos']):
					if text1_lemmas[token_pos] == text2_lemmas[token_pos] and text1_forms[token_pos] == text2_forms[token_pos]:
						if text1_word_texts[token_pos].lower() == text2_word_texts[token_pos].lower():
							casediff +=1
						else:
							worddiff +=1
						resolved_pos.append(token_pos)
				
				deleted_log = []
				added_log = []
				for ind in reversed(sorted(resolved_pos)):
					text1_lemmas.pop(ind)
					deleted_log.insert(0, text1_word_texts.pop(ind))
					text1_postags.pop(ind)
					text1_forms.pop(ind)
					text2_lemmas.pop(ind)
					added_log.insert(0, text2_word_texts.pop(ind))
					text2_postags.pop(ind)
					text2_forms.pop(ind)
				
				if len (resolved_pos):
					if casediff:
						flags.append('suurväike')
						correction_sets.append( {'type':'suurväike',  'added': added_log, 'deleted':deleted_log})
					if worddiff:
						flags.append('paralleelvorm')
						correction_sets.append( {'type':'paralleelvorm',  'added': added_log, 'deleted': deleted_log})
			
				new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
				ndiff_result_list_copy = list(new_ndiff_result)		
				ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
				if not ndiff_data['changed']:
					finished = 1
		
	
	
		###################
		#	valevorm
		####################
		if not finished:
			# another form of the same word
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
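			# 'valevorm': the lemma stays the same but the word form differs, i.e. the
			# right word was used in the wrong form; the else branch applies the test
			# token by token at matching positions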
			# looks at the whole text at once
			if (text1_lemmas == text2_lemmas and not text1_word_texts==text2_word_texts ):
				flags.append('valevorm')
				correction_sets.append( {'type':'valevorm',  'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
				finished =1
			# look at substitutions at the same position
			else:
				
				resolved_pos = []
				for token_pos in ndiff_data['pos_intersection']:
					if len(intersection(text1_lemmas[token_pos].split('|'), text2_lemmas[token_pos].split('|'))):
					# if text1_lemmas[token_pos] == text2_lemmas[token_pos]:
						resolved_pos.append(token_pos)
						
				
				
				
				deleted_log = []
				added_log = []
				for ind in reversed(sorted(resolved_pos)):
					text1_lemmas.pop(ind)
					deleted_log.insert(0, text1_word_texts.pop(ind))
					text1_postags.pop(ind)
					text1_forms.pop(ind)
					text2_lemmas.pop(ind)
					added_log.insert(0, text2_word_texts.pop(ind))
					text2_postags.pop(ind)
					text2_forms.pop(ind)
				
				if len (resolved_pos):
					flags.append('valevorm')
					correction_sets.append( {'type':'valevorm',  'added': added_log, 'deleted': deleted_log})
				new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
				ndiff_result_list_copy = list(new_ndiff_result)
				ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
				if not ndiff_data['changed']:
					finished = 1
		
		
	
		###################
		#	valelemma
		####################
		if not finished:
			# same form, different word
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
			
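			# 'valelemma': the morphological form stays the same but the lemma differs,
			# i.e. a different word was used in the same form; the else branch applies
			# the test token by token at matching positions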
			if (text1_forms == text2_forms and not text1_lemmas==text2_lemmas ):
				flags.append('valelemma')
				correction_sets.append( {'type':'valelemma',  'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
				finished =1
				
			# look at substitutions at the same position
			else:
				
				resolved_pos = []
				for token_pos in ndiff_data['pos_intersection']:
					sub_type = ''
					if text1_forms[token_pos] == text2_forms[token_pos] and not text1_lemmas[token_pos]==text2_lemmas[token_pos]:
						resolved_pos.append(token_pos)
					elif intersection(text1_forms[token_pos].split('|'), text2_forms[token_pos].split('|')) and not text1_lemmas[token_pos]==text2_lemmas[token_pos]:
						sub_type = '2'
						resolved_pos.append(token_pos)
					elif len(intersection(text1_forms[token_pos].split('|'), text2_forms[token_pos].split('|'))) and not len(intersection( text1_lemmas[token_pos].split('|'), text2_lemmas[token_pos].split('|'))):
						sub_type = '3'
						resolved_pos.append(token_pos)
					
				
					
				
				deleted_log = []
				added_log = []
				for ind in reversed(sorted(resolved_pos)):
					text1_lemmas.pop(ind)
					deleted_log.insert(0, text1_word_texts.pop(ind))
					text1_postags.pop(ind)
					text1_forms.pop(ind)
					text2_lemmas.pop(ind)
					added_log.insert(0, text2_word_texts.pop(ind))
					text2_postags.pop(ind)
					text2_forms.pop(ind)
				
				if len (resolved_pos):
					flags.append('valelemma')
					correction_sets.append( {'type':'valelemma'+sub_type,  'added': added_log, 'deleted': deleted_log})
				new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
				ndiff_result_list_copy = list(new_ndiff_result)
				ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
				if not ndiff_data['changed']:
					finished = 1
				
		
			
		if not finished:
			
			flags.append('0')
			unified_remained_diff_result = difflib.unified_diff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))

			html_diff_result = []
			for line in unified_remained_diff_result:
				if line.startswith('-'):
					html_diff_result.append('<span style="color:red">%s</span>' % line)
				elif line.startswith('+'):
					html_diff_result.append('<span style="color:green">%s</span>' % line)
				elif line.startswith('?'):
					html_diff_result.append('<span style="color:gray">%s</span>' % line)
				else:
					html_diff_result.append('%s' % line)
			
			corrections[uid][i]['remained_diff'] = html_diff_result
		
		
		if finished :
			corrections[uid][i]['remained_diff'] = ''
		
		
		
		# the tokenized original and the untokenized original do not differ from each other;
		# the first thing we check is whether punctuation marks were added or removed,
		# and if we find that they were, we take them from the diff result
		
		ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
		
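		# flag '0' marks an unexplained difference; '00' marks the special case where
		# the only flag is '0' and the remaining additions and deletions sit at
		# exactly the same positions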
		if len(flags)==1 and flags[0] == '0' and ndiff_data['added_pos'] == ndiff_data['deleted_pos']:
			flags=['00']
		
		
		corrections[uid][i]['correction_sets'] = correction_sets
		
		corrections[uid][i]['flags'] = flags	
		corrections[uid][i]['flags_label'] = "_".join(sorted(flags))
		if corrections[uid][i]['flags_label'] == '0' or corrections[uid][i]['flags_label'] == '00':
			corrections[uid][i]['remained_diff'] = ''
		
		
		
		
		
		rows_html  = ''
		rows_html  += '<tr><td colspan="3">&nbsp;</td><td><a href="diff/%s.html" target="_blank">Võrdlus:</a></td>\n' %( correction['id'])

		rows_html  += '<tr><td><b>%s</b></td><td>%s</td><td>%s</td><td rowspan="2">%s</td></tr>\n' %( uid, originals[uid]['text'], originals[uid]['tokenized'], "<br/>".join(correction['html_diff_result']) )
		rows_html  += '<tr><td><b>%s</b></td><td>%s</td><td>%s</td></tr>\n' %(  correction['id'], correction['text'],  correction['tokenized'])

		rows_html  += '<tr><td colspan="4">&nbsp;</td></tr>\n'
		
		
		
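		# collected_flags counts corrections per flag combination; one result page
		# per combination is written further below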
		kkey = "_".join(sorted(flags))
		if not kkey in collected_flags:
			collected_flags[kkey] = 0
		collected_flags[kkey] += 1
		#print (flags, correction['text'])
		if '0' in flags and kkey in ('0', '00'):
			stats['lahendamata1'] +=1
		elif '0' in flags:
			stats['lahendamata1'] +=1
			stats['lahendamata2'] +=1
file_out.write('</table>')
file_out.write ('</body></html>') 
file_out.close()	
	
	
	
for label in collected_flags:
	# such a file already exists
	#if label == 'tundmatu': continue 
	
	print (label)
	filename2 = 'tulemus/%s.html' % label
	file_out = open(filename2, 'w')
	
	file_out.write( '<html><head><meta charset="UTF-8"></head>')
	file_out.write( """
	<style>

	table   {
		margin-top: 50px;
		width: 1000px;
		border-collapse: collapse;

		}
	table, th, td 
	{
		border: 1px solid black;
	
		vertical-align:top;
		padding: 5px;
	}
	</style>

	<body>
	""") 
	
	file_out.write( '<h1>%s (%d)</h1>' % (label,collected_flags[label]))
	file_out.write( '<h3>Kontrolliti %d parandust</h3>' % (stats['total']))
	file_out.write( '<h3>Viga tuvastatud: %d</h3>' % (stats['total']- stats['lahendamata1']-stats['lahendamata2']))
	file_out.write( '<h3>Viga tuvastamata: %d (osaliselt tuvastatud %d)</h3>' % (stats['lahendamata1'] + stats['lahendamata2'] , stats['lahendamata2']))
	
	file_out.write( '<h4>Kirjeldus <a target="_new" href="kirjeldus.html">#</a></h4>')
	
	for label2 in sorted(collected_flags):
		file_out.write( '<span>%s <a href="%s.html">(%d)</a></span><br/>' % (label2,label2,collected_flags[label2]))
	
	
	file_out.write( '<table style="padding:5px">')


	for uid in sorted(corrections.keys()):
		for (i, correction) in enumerate(corrections[uid]):
			if not 'flags' in correction : continue
			if not len (correction['flags']): continue

			if not label ==  correction['flags_label']: continue

			rows_html  = ''
			rows_html  += '<tr><td>&nbsp;</td><td colspan="2">%s</td><td><a href="../morf/%s.html" target="_blank">Võrdlus:</a></td>\n' %( label, correction['id'])

			rows_html  += '<tr><td><b>%s</b></td><td>%s</td><td>%s</td><td rowspan="2">%s</td></tr>\n' %( uid, originals[uid]['text'], originals[uid]['tokenized'], "<br/>".join(correction['html_diff_result']) + '<hr/>' + "<br/>".join(correction['remained_diff']))
			rows_html  += '<tr><td><b>%s</b></td><td>%s</td><td>%s</td></tr>\n' %(  correction['id'], correction['text'],  correction['tokenized'])

			for corr_set in corrections[uid][i]['correction_sets']:

				rows_html  += '<tr><td>&nbsp;</td><td>%s</td><td><span style="color:red">%s</span> ---&gt; <span  style="color:green">%s</td><td></td></tr>\n' % (corr_set['type'],  ' '.join(corr_set['deleted']) , ' '.join(corr_set['added'] ) )

			rows_html  += '<tr><td colspan="4">&nbsp;</td></tr>\n'

			# rows_html is rebuilt for every matching correction, so write it out here
			file_out.write(rows_html)




	file_out.write('</table>')

	
# classification of the corrections: descriptions of the error types
errorDescriptions = {
	
	'sõnajärg' : { 
		'order':'1',
		'rows': [
				[ 'sõnajärg', 'Sõna asukoht lauses on muutunud. Sõnakuju (va. väike-suurtäht lause alguses) pole muutunud. Sõnade morfanalüüsi ei vaadata.']
		] },

	'punktuatsioon' : { 
		'order':'2',
		'rows': [
				[ 'punktuatsioon', 'Lauses on muudetud (lisatud, eemaldatud, asendatud) mõnda kirjavahemärki.']
		]},
	
	'kokku-lahku' : { 
		'order':'3',
		'rows': [
				[ 'kokku-lahku', 'Sõnade kokku- ja lahkukirjutamist on muudetud.']
		]},
	
	'valelemma' : { 
		'order':'4',
		'rows': [
				[ 'valelemma', 'Sõnavorm on sama, lemma on erinev. (Mitmesuse puhul otsitakse ühisosa.) Vaadatakse originaallauses/paranduses samal positsioonil olevaid sõnu.']
		]},
	'valevorm' : { 
		'order':'5',
		'rows': [
				[ 'valevorm', 'Lemma on sama. Sõnavorm on erinev. (Mitmesuse puhul otsitakse ühisosa.) Vaadatakse originaallauses/paranduses samal positsioonil olevaid sõnu.']
		]},
	'paralleelvorm' : { 
		'order':'5',
		'rows': [
				[ 'paralleelvorm', 'Sõnakuju on erinev. Lemma on sama. Sõnavorm on sama. (Mitmesuse puhul otsitakse ühisosa.) Vaadatakse originaallauses/paranduses samal positsioonil olevaid sõnu.']
		]},
	'sõnapuudu' : { 
		'order':'5',
		'rows': [
				[ 'sõnapuudu', 'Lauses oli sõna puudu.']
		]},
	'sõnaüle' : { 
		'order':'5',
		'rows': [
				[ 'sõnaüle', 'Lauses  oli sõna üle.']
		]},
		
}


filename2 = 'tulemus/kirjeldus.html'
file_out = open(filename2, 'w')
file_out.write( '<html><head><meta charset="UTF-8"></head>')
file_out.write( "<body>") 
file_out.write( '<h1>Tüübid</h1>')

for key in sorted(errorDescriptions.keys()):
	#file_out.write( '<h2>%s</h2>' % key)
	
	for row in errorDescriptions[key]['rows']:
		file_out.write( '<p><b>%s</b></p><p style="padding-left:20px">%s</p>' % (row[0] , row[1]))
		
file_out.write ('</body></html>') 
file_out.close()


	

exit()