Commit b1121a1d authored by Neeme Kahusk
Browse files

detect non-unique synset numbers

parent 544b55e0
...@@ -11,6 +11,26 @@ EX_EXT=`echo $EXPORTFILE |sed -E "s/[^.]+[.]([^.]+)$/\1/1"` ...@@ -11,6 +11,26 @@ EX_EXT=`echo $EXPORTFILE |sed -E "s/[^.]+[.]([^.]+)$/\1/1"`
echo 'Do we have exportfile extension?' echo 'Do we have exportfile extension?'
echo $EX_EXT echo $EX_EXT
# Do we have unique Synset numbers?
#
# Synset records are the lines of the export file ($1) that start with '0'.
# The sed expression reduces a record of the form "0 @<digits>@..." to its
# digits; a '0' line of any other shape passes through unchanged and is then
# compared as a whole line.
# Arguments: $1 - path of the export file
# Outputs:   one synset number (or unmatched line) per line, sorted
synset_numbers() {
  grep -- '^0' "$1" | sed -E 's/(0 @)([0-9]+)(@.+$)/\2/g' | sort
}

# Both counts are taken over the extracted numbers, so two records that share
# a number but differ elsewhere on the line are still detected as duplicates.
# tr removes any padding wc may print around the count.
NO_OF_ORIGSYNSETS=$( synset_numbers "$1" | wc -l | tr -d '[:space:]' )
echo "$NO_OF_ORIGSYNSETS"
NO_OF_UNIQESYNSETS=$( synset_numbers "$1" | uniq | wc -l | tr -d '[:space:]' )
echo "$NO_OF_UNIQESYNSETS"

if [ "$NO_OF_ORIGSYNSETS" -eq "$NO_OF_UNIQESYNSETS" ] ; then
  echo "Synset numbers are unique"
else
  echo "Synset numbers ARE NOT unique"
  # all.numbers: every distinct number once.
  synset_numbers "$1" | uniq > all.numbers
  # doubled.numbers: every number that occurs more than once. uniq -d does
  # not depend on the column layout of "uniq -c" and also reports numbers
  # that occur three or more times.
  synset_numbers "$1" | uniq -d > doubled.numbers
fi
echo "$NO_OF_ORIGSYNSETS"
echo "$NO_OF_UNIQESYNSETS"
# NOTE(review): this unconditional exit stops the script here, so nothing
# below it runs. It looks like a debugging stop - confirm before removing.
exit
# all operations are done on NORMFILE! # all operations are done on NORMFILE!
# extensions of index files # extensions of index files
NORMEXT='norm' # normfile NORMEXT='norm' # normfile
...@@ -21,10 +41,13 @@ TIXEXT='tix' # tuple index file ...@@ -21,10 +41,13 @@ TIXEXT='tix' # tuple index file
RELEXT='rix' # relations index file RELEXT='rix' # relations index file
ILIEXT='iix' # ili relations index file ILIEXT='iix' # ili relations index file
ILAEXT='iax' # ili relations add on index file ILAEXT='iax' # ili relations add on index file
TEMPEXT='tmp' # temporary file (for synset numbers)
NORMFILE=`echo $EXPORTFILE|sed -E "s/([^.]+[.])($EX_EXT)$/\1$NORMEXT/1"` NORMFILE=`echo $EXPORTFILE|sed -E "s/([^.]+[.])($EX_EXT)$/\1$NORMEXT/1"`
# echo $NORMFILE # echo $NORMFILE
# TEMPFILE: the EXPORTFILE name with its trailing $EX_EXT replaced by $TEMPEXT
TEMPFILE=`echo $EXPORTFILE|sed -E "s/([^.]+[.])($EX_EXT)$/\1$TEMPEXT/1"`
echo 'Making index files...' echo 'Making index files...'
OFFSET_IDX_FILE=`echo $EXPORTFILE|sed -E "s/([^.]+[.])($EX_EXT)$/\1$SOIEXT/1"` OFFSET_IDX_FILE=`echo $EXPORTFILE|sed -E "s/([^.]+[.])($EX_EXT)$/\1$SOIEXT/1"`
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Neeme Kahusk <neeme.kahusk@ut.ee>'
__version__ = '1.0'
__date__ = 'T nov 20 15:24:18 EET 2012'
from optparse import OptionParser
# Option Parser
#---------------------------------------
# Exactly two positional arguments are required. The usage text names them so
# that --help and the "incorrect number of arguments" error show what to pass.
parser = OptionParser(usage="%prog [options] RAW_INDEX_FILE LITERAL_INDEX_FILE",
                      version="%prog "+__version__+' ('+__date__+')')
(options, args) = parser.parse_args()
if len(args) != 2:
    parser.error("incorrect number of arguments")
# First argument: raw index file name; second argument: literal index file name.
RAW_INDEX_FN, LITERAL_INDEX_FN = args
def read_raw_index(filename):
    """Read *filename* and return its lines split into fields.

    Every line is stripped of surrounding whitespace and then split on ':'.

    Returns a list holding one list of strings per line, in file order.
    """
    # The with-block closes the file even when reading raises. The list
    # comprehension gives a real list on Python 2 and Python 3 alike,
    # whereas map() only returns a list on Python 2.
    with open(filename) as f:
        return [line.strip().split(':') for line in f]
def make_literal_index(iList):
    """Group the second member of every row under its first member.

    iList should be list of lists with 2 members. Members beyond the
    second are ignored. A row with fewer than two members is reported on
    standard output and skipped.

    Returns a dict mapping each first member to the list of second members
    that occurred with it, in input order.
    """
    oDict = dict()
    for i in iList:
        try:
            key, value = i[0], i[1]
        except IndexError:
            # A row that is too short (e.g. a line without ':') fails on
            # indexing, not on the dict access, so IndexError is the
            # exception to handle here.
            print('võtme viga %s' % (i,))
            continue
        # setdefault creates the list on first sight of a key.
        oDict.setdefault(key, []).append(value)
    return oDict
# Load the rows of the raw index file named by RAW_INDEX_FN
ri = read_raw_index(RAW_INDEX_FN)
def write_index(iDict,filename):
    """Write index (dict) to file

    One line is appended per entry, in the form ``key:value value ...``,
    encoded as UTF-8. The file is opened in append mode, so the lines are
    added after whatever the file already holds.

    iDict maps a key string to a list of value strings. Byte strings are
    decoded as UTF-8; text strings are used as they are. An entry that
    cannot be decoded has its value list printed to standard output and is
    skipped.
    """
    def as_text(s):
        # Byte strings are taken to be UTF-8; text passes through untouched.
        if isinstance(s, bytes):
            return s.decode('utf8')
        return s

    # Binary append mode: each line is written as explicitly encoded bytes,
    # which works the same way on Python 2 and Python 3. The with-block
    # closes the file even when a write raises.
    with open(filename,'ab') as f:
        for key in iDict:
            try:
                oStr = u'%s:%s\n' % (as_text(key),
                                     u' '.join(as_text(v) for v in iDict[key])
                                     )
            except UnicodeDecodeError:
                print(iDict[key])
                # Skip the entry: there is no line to write for it, and
                # writing here would repeat the previous entry's line.
                continue
            f.write(oStr.encode('utf-8'))
# Build the literal index from the raw rows and hand it to write_index
# together with the output file name LITERAL_INDEX_FN
litindex = make_literal_index(ri)
write_index(litindex,LITERAL_INDEX_FN)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment