morfDiff.py 2.78 KB
Newer Older
rabauti's avatar
rabauti committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/python
# -*- coding: utf-8 -*-


import os
import sys
import re
import  difflib
import  copy


from  difflib import Differ
from estnltk import Text


# Containers filled while the corpus is read. Both are keyed by the uid
# with its last '_'-separated segment removed:
#   originals[key]   -> {'id': ..., 'text': ...}
#   corrections[key] -> list of {'id': ..., 'text': ...}
originals = dict()
corrections = dict()

# Every path is resolved relative to the directory that holds this script.
script_name = os.path.realpath(__file__)
script_dir = os.path.split(script_name)[0]

# Output directory for the generated reports and the input corpus file.
outdir = script_dir + '/morf'
filename = script_dir +'/../korpus_tsv/korpus.tsv'

# Read the corpus: every line must be exactly "<uid>\t<text>".
# The last '_'-separated segment of the uid selects the record type by its
# first character: 'a' is stored as the original text, 'p' is appended to
# the list of corrections. The rest of the uid (uid2) is the key shared by
# an original and its corrections. Other record types are ignored.
# NOTE(review): the file is opened with the platform default encoding --
# confirm that this matches the encoding of the corpus.
with open(filename, 'r') as file_input:
	for line in file_input:

		# rstrip() removes the line end and any other trailing whitespace.
		line = line.rstrip()
		line_arr = line.split('\t')

		# A malformed line aborts the whole run.
		if len(line_arr) != 2:
			print ('ERROR :' , line)
			sys.exit(1)

		uid, text = line_arr

		# Split off the last '_' segment; without any '_' the key is ''
		# and the whole uid is treated as the ending.
		uid2, _sep, uid_ending = uid.rpartition('_')

		# Slicing keeps an empty ending (uid ending in '_') from raising
		# IndexError; such a record simply matches no type.
		el_type = uid_ending[:1]

		if el_type == 'a':
			originals[uid2] = {'id': uid, 'text':text}

		if el_type == 'p':
			corrections.setdefault(uid2, []).append({'id': uid, 'text':text})

def _morph_rows(text):
	"""Return one tab-separated "word, lemma, postag, form" row per word of an estnltk Text."""
	# The four lists are combined position by position, so they are expected
	# to be parallel (one entry per word).
	return ["\t".join(row) for row in zip(text.word_texts, text.lemmas, text.postags, text.forms)]


def _diff_lines(rows):
	"""Join rows with newlines and split them again, keeping the line ends, for HtmlDiff."""
	return "\n".join(rows).splitlines(True)


# Write one HTML report per correction into outdir, named after the id of
# the correction. Each report compares the original with the correction
# twice: word by word, and row by row with the morphological analysis added.
d = difflib.HtmlDiff( tabsize=4, wrapcolumn=40 )

# Make sure the report directory exists before anything is written into it.
if not os.path.isdir(outdir):
	os.makedirs(outdir)

for uid, uid_corrections in corrections.items():

	# A correction without its original cannot be compared; fail with a
	# clear message instead of a bare KeyError.
	if uid not in originals:
		print ('ERROR : no original text for' , uid)
		sys.exit(1)

	# The original is the same for every correction of this uid, so it is
	# analysed only once.
	text1_copy = Text(originals[uid]['text'])
	text1_word_lines = _diff_lines(text1_copy.word_texts)
	text1_tsv_lines = _diff_lines(_morph_rows(text1_copy))

	for correction in uid_corrections:
		filename2 =  '%s/%s.html' % (outdir, correction['id'])

		text2_copy = Text(correction['text'])

		# Word-level side-by-side table.
		html_diff_result = d.make_table(text1_word_lines, _diff_lines(text2_copy.word_texts))

		# Full HTML document comparing the word/lemma/postag/form rows.
		# NOTE(review): the table above ends up in front of the complete
		# document produced by make_file(); kept as in the original output.
		html_diff_result += d.make_file(text1_tsv_lines, _diff_lines(_morph_rows(text2_copy)))

		with open(filename2, 'w') as file_out:
			file_out.write(html_diff_result)