Commit 1a2f57b3 authored by Neeme Kahusk
Browse files

encoding

parent 514fff3c
......@@ -15,6 +15,7 @@ import timing
# import argparse
import sys
import os
import codecs
# print os.environ['HOME']
from lxml import etree
......@@ -128,7 +129,7 @@ def parse_wn_index(path):
"""
SYNSET_TYPE = {'1':'n','2':'v','3':'a','4':'r','5':'s'}
FILE = path + '/' + 'index.sense'
with open(FILE, 'r') as f:
with codecs.open(FILE, 'r','utf8') as f:
lines = map(lambda x: x.split(), f.readlines())
oDict = {}
for i in lines:
......@@ -251,7 +252,7 @@ def parse_dataline(iList,indexDict=WNI):
# of 0 is the default, and therefore is not present in
# lexicographer files.
while wordCounter:
oDict['synonym'].append({'word':first.pop(0),
oDict['synonym'].append({'word':first.pop(0).decode('utf8'),
'lex_id':first.pop(0)}
)
wordCounter -=1
......@@ -356,7 +357,7 @@ def wnparser(path,pos,start=0,end=-1):
"""Parses Princeton wordnet data file"""
FILE = path + '/' + 'data.' + pos
oList = []
with open(FILE, 'r') as f:
with codecs.open(FILE, 'r','utf8') as f:
lines = f.readlines()
licenceList = filter(lambda x: x.startswith(' '),lines)
contentList = map(lambda x: x.strip().split('|'),
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment