liigitus2.py 28.4 KB
Newer Older
rabauti's avatar
rabauti committed
1
2
3
4
5
6
7
8
9
#!/usr/bin/python
# -*- coding: utf-8 -*-


import os
import sys
import re
import  difflib
import  copy
rabauti's avatar
rabauti committed
10
from  datetime import datetime
rabauti's avatar
rabauti committed
11
12

from estnltk import Text
rabauti's avatar
rabauti committed
13
from difflib import Differ
rabauti's avatar
rabauti committed
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106

# Try to detect some of the simpler error types.
# Source data: ../korpus_tsv

# All sentences are read in and kept in dicts.

# Layout as sketched by the author:
#   item['sentenceID']['a']       = {'id': 'sentence'}
#   item['sentenceID']['p'][0..n] = {'id': 'sentence'}
# What the loader further down actually stores:
#   originals[key]   = {'id': <full id>, 'text': <sentence>}
#   corrections[key] = [{'id': <full id>, 'text': <sentence>}, ...]
# where key is the id without its last "_"-separated part.
originals 	= {} 
corrections 	= {} 

# Resolved path of this script and the directory that contains it.
script_name = (os.path.realpath(__file__))
script_dir = os.path.dirname(script_name)

def intersection(lst1, lst2):
	"""Return the distinct elements that occur in both lst1 and lst2.

	An empty list is returned when either argument is empty or None.
	The order of the returned elements is not defined.
	"""
	for candidate in (lst1, lst2):
		if not candidate or not len(candidate):
			return []
	shared = set(lst1) & set(lst2)
	return list(shared)


def get_data_from_ndiff(ndiff_list):
	"""Collect the added and deleted tokens from difflib.ndiff output lines.

	Lines starting with '-' are counted as deleted tokens of the old
	sequence, lines starting with '+' as added tokens of the new sequence,
	lines starting with '?' are skipped and every other line advances both
	sequences.  Positions are zero-based indexes into the respective
	sequence.

	Returns a dict with the keys 'added', 'added_pos', 'deleted',
	'deleted_pos', 'changed' (False only when nothing was added or deleted)
	and 'pos_intersection' (sorted positions present in both 'added_pos'
	and 'deleted_pos').
	"""
	new_tokens, new_positions = [], []
	old_tokens, old_positions = [], []
	old_index = new_index = -1

	for entry in ndiff_list:
		marker = entry[:1]
		if marker == '?':
			# hint line, carries no token
			continue
		# strip trailing whitespace and the two-character ndiff prefix
		word = entry.rstrip()[2:]
		if marker == '-':
			old_index += 1
			old_tokens.append(word)
			old_positions.append(old_index)
		elif marker == '+':
			new_index += 1
			new_tokens.append(word)
			new_positions.append(new_index)
		else:
			# token present in both sequences
			old_index += 1
			new_index += 1

	return {
		'added': new_tokens,
		'added_pos': new_positions,
		'deleted': old_tokens,
		'deleted_pos': old_positions,
		'changed': bool(new_positions or old_positions),
		'pos_intersection': sorted(intersection(new_positions, old_positions)),
	}


# Input corpus: one "<id><TAB><sentence>" record per line.
filename = script_dir +'/../korpus_tsv/korpus.tsv'
#outdir = script_dir +'/korpus_xml'

# Read the corpus as UTF-8 (the generated HTML pages declare UTF-8 as well)
# and make sure the file handle is closed again, also on the error exit.
with open(filename, 'r', encoding='utf-8') as file_input:
	for line in file_input:

		line = line.rstrip()
		line_arr = line.split('\t')

		# Every record must consist of exactly two tab-separated fields;
		# anything else aborts the run with exit status 1.
		if not len(line_arr) == 2:
			print ('ERROR :' , line)
			sys.exit(1)

		uid = line_arr[0]
		text = line_arr[1]

		# The last "_"-separated part of the id decides the record type by its
		# first character; the remaining parts form the key shared by an
		# original sentence and its corrections.
		uid_arr = uid.split('_')
		uid_ending = uid_arr.pop()
		uid2 = '_'.join(uid_arr)
		# Slicing instead of indexing: an id that ends with "_" has an empty
		# ending and is then simply ignored instead of raising IndexError.
		el_type = uid_ending[:1]

		# 'a' records are stored as the original sentence for the key.
		if el_type == 'a':
			originals[uid2] = {'id': uid, 'text':text.strip()}

		# 'p' records are appended to the list of corrections for the key.
		if el_type == 'p':
			if not uid2 in corrections:
				corrections[uid2] = []
			corrections[uid2].append({'id': uid, 'text':text.strip()})


# Number of corrections per combined flag label, filled by the main loop.
collected_flags = {}
rabauti's avatar
rabauti committed
107
108
109



rabauti's avatar
rabauti committed
110
111
112
113
114
115
116

# Start processing the corrections.
# Diff helpers: a Differ and an HtmlDiff configured with tab size 4 and
# wrapping at column 40.
d = Differ()
d2 = difflib.HtmlDiff( tabsize=4, wrapcolumn=40 )

# Running counters; 'total' is incremented once per processed correction.
stats = {}
stats['total'] = 0
rabauti's avatar
rabauti committed
117
stats['lahendatud'] = 0
rabauti's avatar
rabauti committed
118
119
stats['lahendamata1'] = 0
stats['lahendamata2'] = 0
rabauti's avatar
rabauti committed
120

rabauti's avatar
rabauti committed
121

rabauti's avatar
rabauti committed
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# Report page that is opened before the main loop; the path is relative to
# the current working directory.
filename2 = 'tulemus/tundmatu.html'
# The page declares charset UTF-8 below, so it is written as UTF-8 explicitly
# instead of with the locale-dependent default encoding.
file_out = open(filename2, 'w', encoding='utf-8')

# Static page header and table styling.
file_out.write( """<html> <meta charset="UTF-8"><head></head>
<style>
table   {
	margin-top: 50px;
	width: 1000px;
	border-collapse: collapse;
	}
table, th, td 
{
	border: 1px solid black;
	vertical-align:top;
	padding: 5px;
}
</style>
<body>
""") 

file_out.write( '<h1>%s</h1>' % ('Tuvastamata veaga'))
file_out.write( '<table style="padding:5px">')
# Counter of processed sentence keys, incremented by the main loop.
linenr = 0
for uid in sorted(corrections.keys()):
	
	linenr +=1
rabauti's avatar
rabauti committed
148
	#if linenr > 100: continue
rabauti's avatar
rabauti committed
149
150
151
152
	to_print = 0


	for (i, correction) in enumerate(corrections[uid]):
rabauti's avatar
rabauti committed
153
154
155
		
		correction_sets = []
		
rabauti's avatar
rabauti committed
156
157
158
159
		stats['total'] += 1
		flags = []
		
		text1 = Text(originals[uid]['text'])
rabauti's avatar
rabauti committed
160
		originals[uid]['tokenized'] 		= ' '.join(text1.word_texts)
rabauti's avatar
rabauti committed
161
162
163
164

		text2 = Text(correction['text'])
		#text2_words = text2.word_texts
		
rabauti's avatar
rabauti committed
165
166
167
168
169
170
		
		corrections[uid][i]['tokenized'] 	= ' '.join(text2.word_texts)
		
		
		
		
rabauti's avatar
rabauti committed
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
		# Token-level diffs of the original against the correction: every token
		# becomes one "line" for difflib.
		ndiff_result = difflib.ndiff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1))
		ndiff_result_list = list(ndiff_result)
		# unified_diff returns a one-shot generator.  It is materialised here so
		# that it can be both rendered as HTML and stored below; previously the
		# rendering loop exhausted it and the stored list was always empty.
		unified_diff_result = list(difflib.unified_diff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1)))

		# Colour the diff lines by their first character:
		# '-' red, '+' green, '?' gray, everything else unstyled.
		html_diff_result = []
		for line in unified_diff_result:
			if line.startswith('-'):
				html_diff_result.append('<span style="color:red">%s</span>' % line)
			elif line.startswith('+'):
				html_diff_result.append('<span style="color:green">%s</span>' % line)
			elif line.startswith('?'):
				html_diff_result.append('<span style="color:gray">%s</span>' % line)
			else:
				html_diff_result.append('%s' % line)
		corrections[uid][i]['unified_diff_result'] = list(unified_diff_result)
		corrections[uid][i]['html_diff_result'] = html_diff_result
		
		
		# Initial data needed for the work:
		#   text1
		#   text2
		
		#   ndiff_result
		#   unified_diff_result
		# To keep the originals intact, the work continues on copies.
		
		# finished: set to 1 by a later step once the correction is fully explained
		
		finished = 0
		# Separate Text objects built from the same strings; the lists taken from
		# them below are modified in place by the later steps (entries are popped).
		# NOTE(review): presumably separate objects are used so that text1/text2
		# keep their own token lists - confirm against the estnltk Text API.
		text1_copy  = Text(originals[uid]['text'])
		text2_copy= Text(correction['text'])
		
		# Parallel per-token lists of both sentences: lemmas, word forms as
		# written, part-of-speech tags and morphological forms.
		text1_lemmas = text1_copy.lemmas
		text2_lemmas = text2_copy.lemmas
		
		
		text1_word_texts = text1_copy.word_texts
		text2_word_texts = text2_copy.word_texts
		
		text1_postags = text1_copy.postags
		text2_postags = text2_copy.postags
	
		text1_forms = text1_copy.forms
		text2_forms = text2_copy.forms
	
	
rabauti's avatar
rabauti committed
217
218
219
		# Append a so-called tail marker to the end, otherwise the diff of the
		# last word may come out wrong.
		text1_lemmas.append('####')
		text2_lemmas.append('####')
rabauti's avatar
rabauti committed
220
		
rabauti's avatar
rabauti committed
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
		text1_word_texts.append('####')
		text2_word_texts.append('####')
		
		text1_postags.append('####')
		text2_postags.append('####')
	
		text1_forms.append('####')
		text2_forms.append('####')
	
	
		
	
		
		ndiff_result = difflib.ndiff("\n".join(text1.word_texts).splitlines(1), "\n".join(text2.word_texts).splitlines(1))
		ndiff_result_list = list(ndiff_result)
rabauti's avatar
rabauti committed
236
237
238
239
240
241
242
243
244
		ndiff_result_list_copy = copy.copy(ndiff_result_list)
		
		added = []
		deleted = []
		
		added_pos = []
		deleted_pos = []
		
		
rabauti's avatar
rabauti committed
245
		#print (ndiff_result_list)
rabauti's avatar
rabauti committed
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
		
		# First a simple check: the untokenized original and the untokenized
		# correction are identical -> flag 'puudub'.
		if originals[uid]['text'] == correction['text']:
			flags.append('puudub')
			finished = 1
		# The raw texts differ but the tokenized forms are equal -> flag 'tühik'.
		if not finished and originals[uid]['tokenized'] == correction['tokenized']:
			flags.append('tühik')
			finished = 1
		
		# Check whether something was done with punctuation marks.
		if not finished:
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
			# Part-of-speech tags of the changed tokens.  Note the naming:
			# added_pos holds the tags of the tokens deleted from the original,
			# deleted_pos the tags of the tokens added in the correction.
			added_pos = [text1_postags[index] for index in ndiff_data['deleted_pos']]
			deleted_pos = [text2_postags[index] for index in ndiff_data['added_pos']]
			
			# The same tags with duplicates removed.
			added_pos_uniq = list(set([text1_postags[index] for index in ndiff_data['deleted_pos']]))
			deleted_pos_uniq = list(set([text2_postags[index] for index in ndiff_data['added_pos']]))
			
			#print ('added_pos_uniq', added_pos_uniq)
			#print ('deleted_pos_uniq', deleted_pos_uniq)
			rowsets = {}
			# The changes concern punctuation marks only: on each side the only
			# tag among the changed tokens is 'Z', or nothing changed on that side.
			if ''.join(added_pos_uniq) in ['Z', ''] and ''.join(deleted_pos_uniq) in ['Z', '']:
				flags.append('punktuatsioon')
rabauti's avatar
rabauti committed
270
				correction_sets.append( {'type':'punktuatsioon', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
rabauti's avatar
rabauti committed
271
272
273
274
275
276
277
278
				
				
				finished = 1
			#muudatused on ka kirjavahemärkidega
			elif 'Z' in added_pos + deleted_pos:
				flags.append('punktuatsioon')
				
				#teeme siin sellise sammu, kus eemaldame kõik kirjavahemärgid ja saame uuesti võrrelda
rabauti's avatar
rabauti committed
279
				deleted_log = []
rabauti's avatar
rabauti committed
280
281
282
283
				for (ind, pos) in reversed(list( enumerate(text1_postags))):
					
					if pos == 'Z':
						text1_lemmas.pop(ind)
rabauti's avatar
rabauti committed
284
						deleted_log.insert(0, text1_word_texts.pop(ind))
rabauti's avatar
rabauti committed
285
286
287
288
						text1_postags.pop(ind)
						text1_forms.pop(ind)
						
				removed = []
rabauti's avatar
rabauti committed
289
				added_log = []
rabauti's avatar
rabauti committed
290
291
292
				for (ind, pos) in reversed(list( enumerate(text2_postags))):
					if pos == 'Z':
						text2_lemmas.pop(ind)
rabauti's avatar
rabauti committed
293
						added_log.insert(0, text2_word_texts.pop(ind))
rabauti's avatar
rabauti committed
294
295
296
						text2_postags.pop(ind)
						text2_forms.pop(ind)
				
rabauti's avatar
rabauti committed
297
				correction_sets.append( {'type':'punktuatsioon', 'added': added_log, 'deleted': deleted_log })
rabauti's avatar
rabauti committed
298
299
300
301
302
303
304
				# Diff the token lists again now that the punctuation tokens are gone.
				new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
				#print ("+++++++")
				#print ("".join(ndiff_result_list_copy))
				ndiff_result_list_copy = list(new_ndiff_result)
				# The ndiff output also lists unchanged tokens (at least the '####'
				# tail), so its length never drops to zero; like the other steps,
				# test whether any added or deleted token is left.
				if not get_data_from_ndiff(ndiff_result_list_copy)['changed']:
					finished = 1
				#print ("".join(ndiff_result_list_copy))
rabauti's avatar
rabauti committed
305
306
307
308
		
		###########################
		# sõnaasukoht
		###########################
rabauti's avatar
rabauti committed
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
		if not finished:
			# Word order check.
			# A word at the beginning of the sentence is compared in lowercase.
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
			
			# Turn the first words into lowercase: when position 0 is among the
			# changed positions, the first changed token is the sentence-initial one.
			if 0 in ndiff_data['added_pos']:
				#print (ndiff_data['added'][0])
				ndiff_data['added'][0] = ndiff_data['added'][0].lower()
				#print (ndiff_data['added'][0])
				
			if 0 in ndiff_data['deleted_pos']:
				#print (ndiff_data['deleted'][0])
				ndiff_data['deleted'][0] = ndiff_data['deleted'][0].lower()
				#print (ndiff_data['deleted'][0])
rabauti's avatar
rabauti committed
324
			
rabauti's avatar
rabauti committed
325
			if (sorted(ndiff_data['added']) == sorted(ndiff_data['deleted'])):
rabauti's avatar
rabauti committed
326
327
				flags.append('sõnaasukoht')
				correction_sets.append( {'type':'sõnaasukoht', 'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
rabauti's avatar
rabauti committed
328
329
330
331
				
				finished = 1
				
			elif intersection(ndiff_data['added'], ndiff_data['deleted']):
rabauti's avatar
rabauti committed
332
				flags.append('sõnaasukoht')
rabauti's avatar
rabauti committed
333
334
335
336
337
338
339
340
				
				# Positions (in the original and in the correction respectively)
				# of the tokens that only changed their place.
				deleted_elements_ind = []
				added_elements_ind = []
				# Remove the words that swapped position.
				for token in intersection(ndiff_data['added'], ndiff_data['deleted']):
					# Only the first occurrence of the token is removed.
					deleted_elements_ind.append(ndiff_data['deleted_pos'][ndiff_data['deleted'].index(token)])
					added_elements_ind.append(ndiff_data['added_pos'][ndiff_data['added'].index(token)])
rabauti's avatar
rabauti committed
341
342
				
				
rabauti's avatar
rabauti committed
343
					
rabauti's avatar
rabauti committed
344
				deleted_log = []
rabauti's avatar
rabauti committed
345
346
				for ind in reversed(sorted(deleted_elements_ind)):
					text1_lemmas.pop(ind)
rabauti's avatar
rabauti committed
347
					deleted_log.insert(0, text1_word_texts.pop(ind))
rabauti's avatar
rabauti committed
348
349
350
					text1_postags.pop(ind)
					text1_forms.pop(ind)

rabauti's avatar
rabauti committed
351
				added_log = []
rabauti's avatar
rabauti committed
352
353
				for ind in reversed(sorted(added_elements_ind)):
					text2_lemmas.pop(ind)
rabauti's avatar
rabauti committed
354
					added_log.insert(0, text2_word_texts.pop(ind))
rabauti's avatar
rabauti committed
355
356
357
					text2_postags.pop(ind)
					text2_forms.pop(ind)
			
rabauti's avatar
rabauti committed
358
				correction_sets.append( {'type':'sõnaasukoht', 'added':  added_log, 'deleted': deleted_log })
rabauti's avatar
rabauti committed
359
				new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
rabauti's avatar
rabauti committed
360
			
rabauti's avatar
rabauti committed
361
362
363
364
365
366
367
368
369
370
371
372
373
				#print ("+++++++")
				#print ("".join(ndiff_result_list_copy))
				ndiff_result_list_copy = list(new_ndiff_result)
				ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
				if not ndiff_data['changed']:
					finished = 1
				#print ("".join(ndiff_result_list_copy))
			
			
		if not finished:
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
			if not len(ndiff_data['added']):
				flags.append('sõnaüle')
rabauti's avatar
rabauti committed
374
				correction_sets.append( {'type':'sõnaüle', 'added':  [], 'deleted': ndiff_data['deleted'] })
rabauti's avatar
rabauti committed
375
376
377
				finished =1
			elif not len(ndiff_data['deleted']):
				flags.append('sõnapuudu')
rabauti's avatar
rabauti committed
378
				correction_sets.append( {'type':'sõnapuudu', 'added':  ndiff_data['added'], 'deleted': [] })
rabauti's avatar
rabauti committed
379
380
				finished =1
		
rabauti's avatar
rabauti committed
381
382
383
384
		
		##################
		#	kokku-lahku 
		##################
rabauti's avatar
rabauti committed
385
386
		if not finished:
			
rabauti's avatar
rabauti committed
387
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
rabauti's avatar
rabauti committed
388
389
			if "".join(text1_word_texts) == "".join(text2_word_texts):
				flags.append('kokku-lahku')
rabauti's avatar
rabauti committed
390
				correction_sets.append( {'type':'kokku-lahku',  'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
rabauti's avatar
rabauti committed
391
392
393
394
				finished =1
			else:
				#otsin lisatud sõnade seas sõnu, mis on järjest positsioonidel ja mille kokkuliitmisel saab mõne kustutatud sõna
				#otsin kustutatud sõnade seas sõnu, mis on järjest positsioonidel ja mille kokkuliitmisel saab mõne lisatud sõna
rabauti's avatar
rabauti committed
395
				
rabauti's avatar
rabauti committed
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
				#added_pos = [text1_postags[index] for index in ndiff_data['deleted_pos']]
				#deleted_pos = [text2_postags[index] for index in ndiff_data['added_pos']]
				
				joined = {}
				
				
				# Snapshot of text1_word_texts used by the loop condition below.
				text1_word_texts_old  = []
				
				# Words that were joined together: two deleted tokens at consecutive
				# positions whose concatenation occurs among the added tokens.
				while not text1_word_texts == text1_word_texts_old:
					
					# Re-diff the current token lists and remember the state of
					# text1_word_texts before this pass.
					new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
					ndiff_result_list_copy = list(new_ndiff_result)
					text1_word_texts_old = copy.copy(text1_word_texts)
					ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
					
					
					# Positions to drop from the correction / from the original.
					remove_added = []
					remove_deleted = []
					
					# Scan the deleted positions from the end of the sentence.
					for pos in reversed(ndiff_data['deleted_pos']):
						ind = ndiff_data['deleted_pos'].index(pos)
						if pos+1 in ndiff_data['deleted_pos']:

							joinedword = ndiff_data['deleted'][ind] + ndiff_data['deleted'][ind+1]
							if joinedword in ndiff_data['added']:
								#print (ndiff_data['deleted'], ndiff_data['added'])
								remove_added.append(ndiff_data['added_pos'][ndiff_data['added'].index(joinedword)])
								remove_deleted.append(pos)
								remove_deleted.append(pos+1)
								#print (remove_deleted, remove_added)
							# NOTE(review): this break is taken for the first pair of
							# consecutive deleted positions found, whether or not the
							# joined word matched, so earlier pairs are not examined
							# in this pass - confirm that this is intended.
							break
					
					if len(remove_deleted):
						flags.append('kokku-lahku')
rabauti's avatar
rabauti committed
431
432
						
						deleted_log = []
rabauti's avatar
rabauti committed
433
						for ind in reversed(sorted(remove_deleted)):
rabauti's avatar
rabauti committed
434
							
rabauti's avatar
rabauti committed
435
							text1_lemmas.pop(ind)
rabauti's avatar
rabauti committed
436
							deleted_log.insert(0, text1_word_texts.pop(ind))
rabauti's avatar
rabauti committed
437
438
							text1_postags.pop(ind)
							text1_forms.pop(ind)
rabauti's avatar
rabauti committed
439
440
							
						added_log = []
rabauti's avatar
rabauti committed
441
442
						for ind in reversed(sorted(remove_added)):	
							text2_lemmas.pop(ind)
rabauti's avatar
rabauti committed
443
							added_log.insert(0, text2_word_texts.pop(ind))
rabauti's avatar
rabauti committed
444
445
							text2_postags.pop(ind)
							text2_forms.pop(ind)
rabauti's avatar
rabauti committed
446
447
448
						correction_sets.append( {'type':'kokku-lahku', 'added': added_log, 'deleted': deleted_log })		
						
							
rabauti's avatar
rabauti committed
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
							
					# NOTE(review): this copy is taken after the tokens were popped, so
					# the enclosing while condition is false after every pass and the
					# loop body runs only once - confirm whether repeated passes
					# were intended.
					text1_word_texts_old = copy.copy(text1_word_texts)
				
				text1_word_texts_old  = []
				# Words that were split apart: two added tokens at consecutive
				# positions whose concatenation occurs among the deleted tokens.
				while not text1_word_texts == text1_word_texts_old:
					
					# Re-diff the current token lists and remember the state of
					# text1_word_texts before this pass.
					new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
					ndiff_result_list_copy = list(new_ndiff_result)
					text1_word_texts_old = copy.copy(text1_word_texts)
					ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
					
					
					# Positions to drop from the correction / from the original.
					remove_added = []
					remove_deleted = []
					# Scan the added positions from the end of the sentence.
					for pos in reversed(ndiff_data['added_pos']):
						ind = ndiff_data['added_pos'].index(pos)
						if pos+1 in ndiff_data['added_pos']:

							joinedword = ndiff_data['added'][ind] + ndiff_data['added'][ind+1]
							if joinedword in ndiff_data['deleted']:
								#print (ndiff_data['added'], ndiff_data['deleted'])
								remove_deleted.append(ndiff_data['deleted_pos'][ndiff_data['deleted'].index(joinedword)])
								remove_added.append(pos)
								remove_added.append(pos+1)
								#print (remove_added, remove_deleted)
							# NOTE(review): taken for the first pair of consecutive added
							# positions found, matched or not - confirm that this is intended.
							break
					
					if len(remove_deleted):
						flags.append('kokku-lahku')
rabauti's avatar
rabauti committed
479
						deleted_log = []
rabauti's avatar
rabauti committed
480
481
						for ind in reversed(sorted(remove_deleted)):
							text1_lemmas.pop(ind)
rabauti's avatar
rabauti committed
482
							deleted_log.insert(0, text1_word_texts.pop(ind))
rabauti's avatar
rabauti committed
483
484
							text1_postags.pop(ind)
							text1_forms.pop(ind)
rabauti's avatar
rabauti committed
485
						added_log = []
rabauti's avatar
rabauti committed
486
487
						for ind in reversed(sorted(remove_added)):	
							text2_lemmas.pop(ind)
rabauti's avatar
rabauti committed
488
							added_log.insert(0, text2_word_texts.pop(ind))
rabauti's avatar
rabauti committed
489
490
							text2_postags.pop(ind)
							text2_forms.pop(ind)
rabauti's avatar
rabauti committed
491
						correction_sets.append( {'type':'kokku-lahku', 'added': added_log, 'deleted': deleted_log })
rabauti's avatar
rabauti committed
492
493
494
495
496
497
498
499
					# NOTE(review): copying after the pops makes the enclosing while
					# condition false, so the loop body runs only once - confirm
					# whether repeated passes were intended.
					text1_word_texts_old = copy.copy(text1_word_texts)
					
				# Diff what is left; when no added or deleted token remains, the
				# correction is fully explained.
				new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
				ndiff_result_list_copy = list(new_ndiff_result)
				ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
				if not ndiff_data['changed']:
					finished = 1
		
rabauti's avatar
rabauti committed
500
501
502
503
		###############
		#	suurväike
		#	paralleelvorm
		###############
rabauti's avatar
rabauti committed
504
505
		if not finished:
			#sama sõna muu vorm
rabauti's avatar
rabauti committed
506
507
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
			
rabauti's avatar
rabauti committed
508
509
			if (text1_lemmas == text2_lemmas and text1_forms == text2_forms and ' '.join(text1_word_texts).lower()==' '.join(text2_word_texts).lower() ):
				flags.append('suurväike')
rabauti's avatar
rabauti committed
510
				correction_sets.append( {'type':'suurväike',  'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
rabauti's avatar
rabauti committed
511
512
513
				finished =1
			elif (text1_lemmas == text2_lemmas and text1_forms == text2_forms and not text1_word_texts==text2_word_texts ):
				flags.append('paralleelvorm')
rabauti's avatar
rabauti committed
514
				correction_sets.append( {'type':'paralleelvorm',  'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
rabauti's avatar
rabauti committed
515
516
517
				finished =1
			
			else:
rabauti's avatar
rabauti committed
518
				
rabauti's avatar
rabauti committed
519
520
521
522
				resolved_pos = []
				
				casediff = 0
				worddiff = 0
rabauti's avatar
rabauti committed
523
				for token_pos in ndiff_data['pos_intersection']:
rabauti's avatar
rabauti committed
524
525
526
527
528
529
530
					if text1_lemmas[token_pos] == text2_lemmas[token_pos] and text1_forms[token_pos] == text2_forms[token_pos]:
						if text1_word_texts[token_pos].lower() == text2_word_texts[token_pos].lower():
							casediff +=1
						else:
							worddiff +=1
						resolved_pos.append(token_pos)
				
rabauti's avatar
rabauti committed
531
532
533
534
				deleted_log_worddiff = []
				added_log_worddiff = []
				deleted_log_casediff = []
				added_log_casediff = []
rabauti's avatar
rabauti committed
535
536
				for ind in reversed(sorted(resolved_pos)):
					text1_lemmas.pop(ind)
rabauti's avatar
rabauti committed
537
538
539
540
541
542
543
544
					if text1_word_texts[ind].lower() == text2_word_texts[ind].lower():
						deleted_log_casediff.insert(0, text1_word_texts.pop(ind))
						added_log_casediff.insert(0, text2_word_texts.pop(ind))
					else:
						deleted_log_worddiff.insert(0, text1_word_texts.pop(ind))
						added_log_worddiff.insert(0, text2_word_texts.pop(ind))
						
						
rabauti's avatar
rabauti committed
545
546
547
					text1_postags.pop(ind)
					text1_forms.pop(ind)
					text2_lemmas.pop(ind)
rabauti's avatar
rabauti committed
548
					
rabauti's avatar
rabauti committed
549
550
551
					text2_postags.pop(ind)
					text2_forms.pop(ind)
				
rabauti's avatar
rabauti committed
552
				if len (resolved_pos):
rabauti's avatar
rabauti committed
553
					if len(added_log_casediff):
rabauti's avatar
rabauti committed
554
						flags.append('suurväike')
rabauti's avatar
rabauti committed
555
556
557
558
559
						correction_sets.append( {'type':'suurväike',  'added': added_log_casediff, 'deleted':deleted_log_casediff})
						#print (ndiff_result_list_copy)
						#print (added_log_casediff)
						
					if len(deleted_log_worddiff):
rabauti's avatar
rabauti committed
560
						flags.append('paralleelvorm')
rabauti's avatar
rabauti committed
561
						correction_sets.append( {'type':'paralleelvorm',  'added': added_log_worddiff, 'deleted': deleted_log_worddiff})
rabauti's avatar
rabauti committed
562
			
rabauti's avatar
rabauti committed
563
564
565
566
567
				new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
				ndiff_result_list_copy = list(new_ndiff_result)		
				ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
				if not ndiff_data['changed']:
					finished = 1
rabauti's avatar
rabauti committed
568
569
570
571
572
573
		
	
	
		###################
		#	valevorm
		####################
rabauti's avatar
rabauti committed
574
575
		if not finished:
			#sama sõna muu vorm
rabauti's avatar
rabauti committed
576
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
rabauti's avatar
rabauti committed
577
578
579
			#vaatas kogu teksti
			if (text1_lemmas == text2_lemmas and not text1_word_texts==text2_word_texts ):
				flags.append('valevorm')
rabauti's avatar
rabauti committed
580
				correction_sets.append( {'type':'valevorm',  'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
rabauti's avatar
rabauti committed
581
582
583
				finished =1
			#vaatame samal positsioonil asuvaid asendusi
			else:
rabauti's avatar
rabauti committed
584
				
rabauti's avatar
rabauti committed
585
				resolved_pos = []
rabauti's avatar
rabauti committed
586
				for token_pos in ndiff_data['pos_intersection']:
rabauti's avatar
rabauti committed
587
588
589
590
591
592
					if len(intersection(text1_lemmas[token_pos].split('|'), text2_lemmas[token_pos].split('|'))):
					# if text1_lemmas[token_pos] == text2_lemmas[token_pos]:
						resolved_pos.append(token_pos)
						
				
				
rabauti's avatar
rabauti committed
593
594
595
				
				deleted_log = []
				added_log = []
rabauti's avatar
rabauti committed
596
597
				for ind in reversed(sorted(resolved_pos)):
					text1_lemmas.pop(ind)
rabauti's avatar
rabauti committed
598
					deleted_log.insert(0, text1_word_texts.pop(ind))
rabauti's avatar
rabauti committed
599
600
601
					text1_postags.pop(ind)
					text1_forms.pop(ind)
					text2_lemmas.pop(ind)
rabauti's avatar
rabauti committed
602
					added_log.insert(0, text2_word_texts.pop(ind))
rabauti's avatar
rabauti committed
603
604
605
					text2_postags.pop(ind)
					text2_forms.pop(ind)
				
rabauti's avatar
rabauti committed
606
607
608
				if len (resolved_pos):
					flags.append('valevorm')
					correction_sets.append( {'type':'valevorm',  'added': added_log, 'deleted': deleted_log})
rabauti's avatar
rabauti committed
609
610
611
612
613
				new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
				ndiff_result_list_copy = list(new_ndiff_result)
				ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
				if not ndiff_data['changed']:
					finished = 1
rabauti's avatar
rabauti committed
614
615
616
617
618
619
		
		
	
		###################
		#	valelemma
		####################
rabauti's avatar
rabauti committed
620
621
		if not finished:
			#sama vorm muu sõna
rabauti's avatar
rabauti committed
622
623
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
			
rabauti's avatar
rabauti committed
624
625
			if (text1_forms == text2_forms and not text1_lemmas==text2_lemmas ):
				flags.append('valelemma')
rabauti's avatar
rabauti committed
626
				correction_sets.append( {'type':'valelemma',  'added': ndiff_data['added'], 'deleted': ndiff_data['deleted'] })
rabauti's avatar
rabauti committed
627
628
629
630
				finished =1
				
			#vaatame samal positsioonil asuvaid asendusi
			else:
rabauti's avatar
rabauti committed
631
				
rabauti's avatar
rabauti committed
632
				resolved_pos = []
rabauti's avatar
rabauti committed
633
				for token_pos in ndiff_data['pos_intersection']:
rabauti's avatar
rabauti committed
634
635
636
637
638
639
640
641
642
643
644
					# NOTE(review): sub_type is reset for every token, so the
					# 'valelemma'+sub_type label built after this loop reflects only the
					# last token visited, not necessarily a resolved one - confirm.
					sub_type = ''
					# Same form string, different lemma string.
					if text1_forms[token_pos] == text2_forms[token_pos] and not text1_lemmas[token_pos]==text2_lemmas[token_pos]:
						resolved_pos.append(token_pos)
					# At least one shared "|"-separated form alternative, different lemma string.
					elif intersection(text1_forms[token_pos].split('|'), text2_forms[token_pos].split('|')) and not text1_lemmas[token_pos]==text2_lemmas[token_pos]:
						sub_type = '2'
						resolved_pos.append(token_pos)
					# A shared form alternative and no shared lemma alternative.
					# NOTE(review): lemma strings without a shared alternative are also
					# unequal, so the previous branch already catches these cases and
					# this one looks unreachable - confirm.
					elif len(intersection(text1_forms[token_pos].split('|'), text2_forms[token_pos].split('|'))) and not len(intersection( text1_lemmas[token_pos].split('|'), text2_lemmas[token_pos].split('|'))):
						sub_type = '3'
						resolved_pos.append(token_pos)
					
				
rabauti's avatar
rabauti committed
645
					
rabauti's avatar
rabauti committed
646
				
rabauti's avatar
rabauti committed
647
648
				deleted_log = []
				added_log = []
rabauti's avatar
rabauti committed
649
650
				for ind in reversed(sorted(resolved_pos)):
					text1_lemmas.pop(ind)
rabauti's avatar
rabauti committed
651
					deleted_log.insert(0, text1_word_texts.pop(ind))
rabauti's avatar
rabauti committed
652
653
654
					text1_postags.pop(ind)
					text1_forms.pop(ind)
					text2_lemmas.pop(ind)
rabauti's avatar
rabauti committed
655
					added_log.insert(0, text2_word_texts.pop(ind))
rabauti's avatar
rabauti committed
656
657
658
					text2_postags.pop(ind)
					text2_forms.pop(ind)
				
rabauti's avatar
rabauti committed
659
660
661
				if len (resolved_pos):
					flags.append('valelemma')
					correction_sets.append( {'type':'valelemma'+sub_type,  'added': added_log, 'deleted': deleted_log})
rabauti's avatar
rabauti committed
662
663
664
665
666
667
				new_ndiff_result = difflib.ndiff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
				ndiff_result_list_copy = list(new_ndiff_result)
				ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
				if not ndiff_data['changed']:
					finished = 1
				
rabauti's avatar
rabauti committed
668
669
670
671
672
673
674
675
676
677
678
679
				
		# What is left after the steps above: only deleted tokens -> 'sõnaüle',
		# only added tokens -> 'sõnapuudu'.
		if not finished:
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
			if not len(ndiff_data['added']):
				flags.append('sõnaüle')
				correction_sets.append( {'type':'sõnaüle', 'added':  [], 'deleted': ndiff_data['deleted'] })
				finished =1
			elif not len(ndiff_data['deleted']):
				flags.append('sõnapuudu')
				correction_sets.append( {'type':'sõnapuudu', 'added':  ndiff_data['added'], 'deleted': [] })
				finished =1
	
rabauti's avatar
rabauti committed
680
681
682
683
684
			
		if not finished:
			
			flags.append('0')
			unified_remained_diff_result = difflib.unified_diff("\n".join(text1_word_texts).splitlines(1), "\n".join(text2_word_texts).splitlines(1))
rabauti's avatar
rabauti committed
685
686
			
			
rabauti's avatar
rabauti committed
687
688
689
690
691
692
693
694
695
696
			html_diff_result = []
			for line in unified_remained_diff_result:
				if line.startswith('-'):
					html_diff_result.append('<span style="color:red">%s</span>' % line)
				elif line.startswith('+'):
					html_diff_result.append('<span style="color:green">%s</span>' % line)
				elif line.startswith('?'):
					html_diff_result.append('<span style="color:gray">%s</span>' % line)
				else:
					html_diff_result.append('%s' % line)
rabauti's avatar
rabauti committed
697
698
			ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
			correction_sets.append( {'type':'lahendamata',  'added':ndiff_data['added'], 'deleted': ndiff_data['deleted']})
rabauti's avatar
rabauti committed
699
700
701
702
703
704
			
			corrections[uid][i]['remained_diff'] = html_diff_result
		
		
		if finished :
			corrections[uid][i]['remained_diff'] = ''
rabauti's avatar
rabauti committed
705
			stats['lahendatud'] += 1
rabauti's avatar
rabauti committed
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
		
		
		# (author's notes, translated)
		# The tokenized original and the untokenized original do not differ.
		# The first thing checked is the presence of added or removed punctuation marks;
		# if they are found, they are taken from the diff result.
		
		ndiff_data = get_data_from_ndiff(ndiff_result_list_copy)
		
		# A correction whose only flag is '0' and whose remaining added and
		# deleted positions coincide is relabelled '00'.
		if len(flags)==1 and flags[0] == '0' and ndiff_data['added_pos'] == ndiff_data['deleted_pos']:
			flags=['00']
		
		
		# Store the results on the correction record.
		corrections[uid][i]['correction_sets'] = correction_sets
		
		corrections[uid][i]['flags'] = flags	
		# The label is the sorted flags joined with "_"; a flag appended more
		# than once appears more than once in the label.
		corrections[uid][i]['flags_label'] = "_".join(sorted(flags))
		if corrections[uid][i]['flags_label'] == '0' or corrections[uid][i]['flags_label'] == '00':
			corrections[uid][i]['remained_diff'] = ''
		
		
		
		
		
		# HTML table rows for this correction: link to the diff page, the
		# original with its tokenized form and the coloured diff, then the
		# correction with its tokenized form.
		# NOTE(review): the texts are inserted without HTML escaping, and
		# rows_html is not written anywhere in the part of the file seen here.
		rows_html  = ''
		rows_html  += '<tr><td colspan="3">&nbsp;</td><td><a href="diff/%s.html" target="_blank">Võrdlus:</a></td>\n' %( correction['id'])

		rows_html  += '<tr><td><b>%s</b></td><td>%s</td><td>%s</td><td rowspan="2">%s</td></tr>\n' %( uid, originals[uid]['text'], originals[uid]['tokenized'], "<br/>".join(correction['html_diff_result']) )
		rows_html  += '<tr><td><b>%s</b></td><td>%s</td><td>%s</td></tr>\n' %(  correction['id'], correction['text'],  correction['tokenized'])

		rows_html  += '<tr><td colspan="4">&nbsp;</td></tr>\n'
		
		
		
		# Count the corrections per flag label.
		kkey = "_".join(sorted(flags))
		if not kkey in collected_flags:
			collected_flags[kkey] = 0
		collected_flags[kkey] += 1
		#print (flags, correction['text'])
rabauti's avatar
rabauti committed
744
		if '0' in flags and kkey in ('0', '00'):
rabauti's avatar
rabauti committed
745
746
747
			stats['lahendamata1'] +=1
		elif '0' in flags:
			stats['lahendamata2'] +=1
rabauti's avatar
rabauti committed
748
		
rabauti's avatar
rabauti committed
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
# Emit the closing table and document tags in one call, then release the
# handle of the report that was being written above.
file_out.writelines(['</table>', '</body></html>'])
file_out.close()
	
	
	
# Write one HTML report per collected flag label: tulemus/<label>.html lists
# every correction whose 'flags_label' equals that label, together with the
# global counters from `stats` and links to the reports of all other labels.
for label in collected_flags:
	print(label)
	filename2 = 'tulemus/%s.html' % label

	# The page declares charset UTF-8 and contains Estonian text, so the file
	# is written as UTF-8 explicitly instead of relying on the locale default.
	# The with-block closes the handle even if formatting a row raises.
	with open(filename2, 'w', encoding='utf-8') as file_out:
		file_out.write( '<html> <meta charset="UTF-8"><head></head>')
		file_out.write( """
	<style>

	table   {
		margin-top: 50px;
		width: 1000px;
		border-collapse: collapse;

		}
	table, th, td 
	{
		border: 1px solid black;
	
		vertical-align:top;
		padding: 5px;
	}
	</style>

	<body>
	""")

		# Page header: generation time, label with its count, global counters.
		file_out.write( '<h5>%s</h5>' % str(datetime.now()))
		file_out.write( '<h1>%s (%d)</h1>' % (label, collected_flags[label]))
		file_out.write( '<h3>Kontrolliti %d parandust</h3>' % (stats['total']))
		file_out.write( '<h3>Viga tuvastatud: %d</h3>' % (stats['lahendatud']))
		file_out.write( '<h3>Viga tuvastamata: %d (osaliselt tuvastatud %d)</h3>' % (stats['lahendamata1'] + stats['lahendamata2'], stats['lahendamata2']))

		file_out.write( '<h4>Kirjeldus <a target="_new" href="kirjeldus.html">#:</a></h4>')

		# Navigation: one link per label, in sorted order.
		for label2 in sorted(collected_flags):
			file_out.write( '<span>%s <a href="%s.html">(%d)</a></span><br/>' % (label2, label2, collected_flags[label2]))

		file_out.write( '<table style="padding:5px">')

		for uid in sorted(corrections.keys()):
			for correction in corrections[uid]:
				# Skip corrections that were never flagged or carry no flags.
				if not correction.get('flags'):
					continue
				# Only corrections belonging to this label's report.
				if label != correction['flags_label']:
					continue

				rows = []
				# Header row (now closed with </tr>).
				rows.append('<tr><td>&nbsp;</td><td colspan="2">%s</td><td><a href="../morf/%s.html" target="_blank">Võrdlus:</a></td></tr>\n' % (label, correction['id']))

				# Original sentence row; the last cell spans both sentence rows
				# and shows the full diff followed by the remaining diff.
				rows.append('<tr><td><b>%s</b></td><td>%s</td><td>%s</td><td rowspan="2">%s</td></tr>\n' % (uid, originals[uid]['text'], originals[uid]['tokenized'], "<br/>".join(correction['html_diff_result']) + '<hr/>' + "<br/>".join(correction['remained_diff'])))
				# Corrected sentence row.
				rows.append('<tr><td><b>%s</b></td><td>%s</td><td>%s</td></tr>\n' % (correction['id'], correction['text'], correction['tokenized']))

				# One row per detected correction set: type, deleted ---> added.
				# The green span is now closed.
				for corr_set in correction['correction_sets']:
					rows.append('<tr><td>&nbsp;</td><td>%s</td><td><span style="color:red">%s</span> ---&gt; <span  style="color:green">%s</span></td><td></td></tr>\n' % (corr_set['type'], ' '.join(corr_set['deleted']), ' '.join(corr_set['added'])))

				# Spacer row between corrections.
				rows.append('<tr><td colspan="4">&nbsp;</td></tr>\n')

				file_out.write(''.join(rows))

		# Finish the document the same way the first report is finished.
		file_out.write('</table>')
		file_out.write('</body></html>')
rabauti's avatar
rabauti committed
826

rabauti's avatar
rabauti committed
827
	
rabauti's avatar
rabauti committed
828
829
# Classification of corrections.
# Maps an error-type key to a dict with:
#   'order' - a string number; NOTE(review): looks like a display-order hint,
#             confirm where it is consumed.
#   'rows'  - list of [label, description] pairs; each pair is rendered as one
#             entry of the description page.
errorDecriptions = {

	'0' : {
		'order':'1',
		'rows': [
				[ '0', 'Tuvastamata.'],
				[ '00', 'Tuvastamata. Parandused ja eksimused on lausetes kohakuti. ']
		] },
	'sõnaasukoht' : {
		'order':'1',
		'rows': [
				[ 'sõnaasukoht', 'Sõna asukoht lauses on muutunud. Sõnakuju (va. väike-suurtäht lause alguses) pole muutunud. Sõnade morfanalüüsi ei vaadata.']
		] },

	'suur-väike' : {
		'order':'1',
		'rows': [
				[ 'suur-väike', 'Sõnas muutusid suurtähed v väiketähed.']
		] },

	'punktuatsioon' : {
		'order':'2',
		'rows': [
				[ 'punktuatsioon', 'Lauses on muudetud (lisatud, eemaldatud, asendatud) mõnda kirjavahemärki.']
		]},

	'kokku-lahku' : {
		'order':'3',
		'rows': [
				[ 'kokku-lahku', 'Parandatud on sõna kokku-lahku kirjutamist.']
		]},

	'valelemma' : {
		'order':'4',
		'rows': [
				[ 'valelemma', 'Sõnavorm on sama, lemma on erinev. (Mitmesuse puhul otsitakse ühisosa.) Vaadatakse originaallauses/paranduses samal positsioonil olevaid sõnu.']
		]},
	'valevorm' : {
		'order':'5',
		'rows': [
				[ 'valevorm', 'Lemma on sama. Sõnavorm on erinev. (Mitmesuse puhul otsitakse ühisosa.) Vaadatakse originaallauses/paranduses samal positsioonil olevaid sõnu.']
		]},
	'parallelvorm' : {
		'order':'5',
		'rows': [
				[ 'parallelvorm', 'Sõnakuju on erinev. Lemma on sama. Sõnavorm on sama. (Mitmesuse puhul otsitakse ühisosa.) Vaadatakse originaallauses/paranduses samal positsioonil olevaid sõnu.']
		]},
	'sõnalisatud' : {
		'order':'5',
		'rows': [
				[ 'sõnalisatud', 'Lauses oli sõna puudu.']
		]},
	'sõnaüle' : {
		'order':'5',
		'rows': [
				[ 'sõnaüle', 'Lauses  oli üleliigne sõna.']
		]},

}


# Write the description page tulemus/kirjeldus.html: for every error type in
# errorDecriptions (iterated in sorted key order) each [label, description]
# row becomes a bold label paragraph followed by an indented description.
filename2 = 'tulemus/kirjeldus.html'
# The page declares charset UTF-8 and the descriptions contain Estonian text,
# so the encoding is set explicitly; the with-block guarantees the handle is
# closed even if a write fails.
with open(filename2, 'w', encoding='utf-8') as file_out:
	file_out.write( '<html> <meta charset="UTF-8"><head></head>')
	file_out.write( "<body>")
	file_out.write( '<h5>%s</h5>' % str(datetime.now()))
	file_out.write( '<h1>Tüübid</h1>')

	for key in sorted(errorDecriptions.keys()):
		for row in errorDecriptions[key]['rows']:
			# The label paragraph is closed with </p>; previously a second
			# <p> was opened there instead.
			file_out.write( '<p><b>%s</b></p><p style="padding-left:20px">%s</p>' % (row[0], row[1]))

	file_out.write('</body></html>')

# End the script here. sys.exit() is used instead of the interactive helper
# exit(), which is only present when the site module has been loaded.
sys.exit()