Commit 7c2a053b authored by Neeme Kahusk's avatar Neeme Kahusk
Browse files

uus maci skript ineksiteks

parent 07052fcc
#!/bin/bash
#
# makes indexes from eurown export file
#
# Print the usage message and abort the script.
# The message goes to stderr and the exit status is non-zero, so a wrong
# invocation is not mistaken for a successful run by calling scripts.
usage() {
  echo 'Usage : make-indexes.sh <wordnet export file>' >&2
  exit 1
}
# The script takes exactly one argument (the export file); anything else
# prints the usage message.
[[ $# -eq 1 ]] || usage
# Print the extension of the file name given as $1.
# The pattern accepts optional leading dots, one dot-free stem, a dot and a
# dot-free extension; when the name does not have that shape the name is
# printed unchanged.
export_file_extension() {
  printf '%s\n' "$1" | sed -E 's/^[.]*[^.]+[.]([^.]+)$/\1/'
}

EXPORTFILE=$1
# Quoted so that names containing blanks or glob characters are not mangled.
EX_EXT=$(export_file_extension "$EXPORTFILE")
# Do we have exportfile extension?
echo 'Do we have exportfile extension?'
echo "$EX_EXT"
# All operations are done on NORMFILE!
# File name extensions of the files this script generates:
NORMEXT="norm"  # normfile
SOIEXT="soi"    # synset offset index
RLXEXT="rlx"    # raw literal index file
LLXEXT="lix"    # literal index file
TIXEXT="tix"    # tuple index file
RELEXT="rix"    # relations index file
ILIEXT="iix"    # ili relations index file
ILAEXT="iax"    # ili relations add on index file
# Do we have unique Synset numbers?

# Print every line of file $1 that starts with "0" (the synset records,
# e.g. `0 @123@ ...`).
synset_lines() {
  grep '^0' -- "$1"
}

# Print the sorted synset numbers of file $1, one per synset record.
# A record that does not have the `0 @<digits>@...` shape is passed through
# unchanged.
synset_numbers() {
  synset_lines "$1" | sed -E 's/(0 @)([0-9]+)(@.+$)/\2/g' | sort
}

# Print every synset number that occurs more than once in file $1, the most
# frequent first.  The count is compared numerically, so numbers that are
# repeated ten or more times are reported as well.
duplicated_synset_numbers() {
  synset_numbers "$1" | uniq -c | sort -nr |
    awk '$1 > 1 { sub(/^ *[0-9]+ /, ""); print }'
}

# `tr` strips the padding some `wc` implementations put around the number.
NO_OF_ORIGSYNSETS=$(synset_lines "$1" | wc -l | tr -d '[:space:]')
echo "$NO_OF_ORIGSYNSETS"
NO_OF_UNIQESYNSETS=$(synset_lines "$1" | sort -u | wc -l | tr -d '[:space:]')
echo "$NO_OF_UNIQESYNSETS"
if [ "$NO_OF_ORIGSYNSETS" -eq "$NO_OF_UNIQESYNSETS" ]; then
  echo "Synset numbers are unique"
else
  echo "Synset numbers ARE NOT unique"
  synset_numbers "$1" | uniq > all.numbers
  duplicated_synset_numbers "$1" > doubled.numbers
  # The export file is rewritten in place, so work from a copy of it.
  cp -- "$1" tempfile
  # NOTE(review): unique_synsets.py is looked up in the current directory and
  # presumably renumbers the duplicated synsets - confirm.
  python unique_synsets.py all.numbers doubled.numbers tempfile > "$EXPORTFILE"
  # all.numbers and doubled.numbers are kept on purpose for inspection.
fi
# Print path $1 with the extension of its last component replaced by $2.
# A component without an extension (or a dot file such as `.name`) simply
# gets `.$2` appended.  Only the last path component is inspected, so dots in
# directory names are left alone, and the result always ends in `.$2`.
index_file_name() {
  local path=$1 ext=$2
  local dir='' base=$path
  if [[ $path == */* ]]; then
    dir=${path%/*}/
    base=${path##*/}
  fi
  # Strip a suffix only when the dot is not the first character of the name.
  if [[ $base == ?*.* ]]; then
    base=${base%.*}
  fi
  printf '%s%s.%s\n' "$dir" "$base" "$ext"
}

NORMFILE=$(index_file_name "$EXPORTFILE" "$NORMEXT")
echo 'Making index files...'
OFFSET_IDX_FILE=$(index_file_name "$EXPORTFILE" "$SOIEXT")
RAWLIT_IDX_FILE=$(index_file_name "$EXPORTFILE" "$RLXEXT")
LIT_IDX_FILE=$(index_file_name "$EXPORTFILE" "$LLXEXT")
TUPLE_IDX_FILE=$(index_file_name "$EXPORTFILE" "$TIXEXT")
REL_IDX_FILE=$(index_file_name "$EXPORTFILE" "$RELEXT")
ILI_IDX_FILE=$(index_file_name "$EXPORTFILE" "$ILIEXT")
ILI_ADD_IDX_FILE=$(index_file_name "$EXPORTFILE" "$ILAEXT")
# make normfile, delete carriage returns (^M or 015)
# Redirecting onto the input would truncate it before it is read, so refuse
# to continue when both names coincide (export file already named *.norm).
if [ "$NORMFILE" = "$EXPORTFILE" ]; then
  echo "make-indexes.sh: normfile would overwrite export file $EXPORTFILE" >&2
  exit 1
fi
tr -d '\015' < "$EXPORTFILE" > "$NORMFILE"
# make synset offset index file
# file is in the following format:
# <synset_number>:<offset_in_file>
# ---------------------------------

# Read a normfile on stdin and print one `<synset_number>:<byte_offset>` line
# for every line containing `0 @`.
synset_offset_index() {
  # grep -b prefixes each match with its byte offset (`<offset>:<line>`);
  # the first sed strips the record markup, the second swaps the two fields.
  grep -b '0 @' |
    sed 's/0 @//g; s/@ WORD_.*//g' |
    sed -E 's/([^:]+):([^:]+)/\2:\1/g'
}

echo 'Making synset offset index file...'
synset_offset_index < "$NORMFILE" > "$OFFSET_IDX_FILE"
echo 'Done!'
# Make raw literal index file
# File is in the following format:
# <literal>:<synset_number>
# NOTE(review): the byte offset is computed by `grep -b` but dropped by the
# sed expression below; the value written after the colon is the number of
# the enclosing synset.  Confirm this is what make_litindex.py expects.
# ---------------------------------

# Read a normfile on stdin and print one `<literal>:<synset_number>` line for
# every `2 LITERAL` line, using the most recent `0 @<number>@` record.
raw_literal_index() {
  grep -b '.*' |
    sed 's/:/ /' |
    awk '/0 @/ { jama = $3 }
/2 LITERAL/ { print jama, $0 }' |
    tr -d '@' |
    sed -E 's/^([0-9]+) ([0-9]+) *2 LITERAL (.*)$/\1:\3/g' |
    tr -d '"' |
    sed -E 's/^([^:]+):(.*)$/\2:\1/g'
}

echo 'Making raw literal index file...'
pwd
echo "$0"
# Directory of this script; the helper scripts are loaded from there.
BASEDIR=$(dirname -- "$0")
echo "$BASEDIR"
echo "$BASEDIR/make_litindex.py"
raw_literal_index < "$NORMFILE" > "$RAWLIT_IDX_FILE"
echo 'Done!'
echo 'Making literal index file...'
# Remove a literal index left over from an earlier run before regenerating it.
# NOTE(review): presumably make_litindex.py appends to or refuses an existing
# output file - confirm.
if [ -f "$LIT_IDX_FILE" ]; then
  echo "File $LIT_IDX_FILE exists, removing"
  rm -- "$LIT_IDX_FILE"
fi
# Arguments: the raw literal index (input) and the literal index path.
python "$BASEDIR/make_litindex.py" "$RAWLIT_IDX_FILE" "$LIT_IDX_FILE"
echo 'Done!'
# Read a normfile on stdin and print one
# `<synset_number>:<part_of_speech>:<literal>:<sense>` line per `3 SENSE`
# line, using the most recently seen synset, part of speech and literal.
tuple_index() {
  awk '/0 @/ { jama = substr($2, 2, length($2) - 2) }
/1 PART_OF_SPEECH/ { pos = $NF }
/2 LITERAL/ { out = jama ":" pos ":"; for (i = 3; i <= NF; i++) { out = out " " $i } }
/3 SENSE/ { print out ":" $NF }' |
    tr -d '"' |
    # The literal is assembled with a leading blank; drop it.
    sed 's/: /:/'
}

echo 'Making raw tuple index file...'
tuple_index < "$NORMFILE" > "$TUPLE_IDX_FILE"
echo 'Done!'
echo 'Making relations index file...'
# mkrelidx.awk (next to this script) reads the normfile on stdin; its output
# is stripped of double quotes and of blanks following a colon.
awk -f "$BASEDIR/mkrelidx.awk" < "$NORMFILE" |
  tr -d '"' |
  sed -E 's/: +/:/g' > "$REL_IDX_FILE"
echo 'Done!'
# Read a normfile on stdin and print one
# `<synset_number>:<pos>:<eq_relation>:<target_pos>:<target_value>` line for
# every level-4 line carrying the tag given as $1 (WORDNET_OFFSET or
# ADD_ON_ID).  The other fields are the most recently seen values, so they
# carry over from earlier records when a record omits them.
ili_relations_index() {
  awk -v tag="$1" '
/0 @/ { jama = $2 }
/1 PART_OF_SPEECH/ { pos = $NF }
/2 EQ_RELATION/ { rel = $NF }
/4 PART_OF_SPEECH/ { targetpos = $NF }
$0 ~ ("4 " tag) { printf "%s:%s:%s:%s:%s\n", jama, pos, rel, targetpos, $NF }
' | tr -d '"@'
}

echo 'Making ili relations index file...'
ili_relations_index WORDNET_OFFSET < "$NORMFILE" > "$ILI_IDX_FILE"
echo 'Done!'
echo 'Making ili relations add on index file...'
ili_relations_index ADD_ON_ID < "$NORMFILE" > "$ILI_ADD_IDX_FILE"
echo 'Done!'
echo 'All indexes done!'
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment