tokenizer.py
#!/usr/bin/python
#coding: utf-8
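"""Tokenize a UTF-8 text file into words with character positions using estnltk.

The output is a single JSON object with "header", "mapping" and "data" parts,
written one record per line (see write_header and write_mapping below).

Usage: python tokenizer.py -i <inputfile> -o <outputfile>
"""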

import sys, getopt
from estnltk import Text
import codecs
import pkg_resources
import datetime
import json

def json_repr(obj):
    """   Represent instance of a class as JSON.
        Arguments:
            obj -- any object
        Return:
            String that represent JSON-encoded object.
    """
    def serialize(obj):
        # Recursively walk object's hierarchy. 
        if isinstance(obj, (bool, int, long, float, basestring)):
            return obj
        elif isinstance(obj, dict):
            obj = obj.copy()
            for key in obj:
                obj[key] = serialize(obj[key])
            return obj
        elif isinstance(obj, list):
            return [serialize(item) for item in obj]
        elif isinstance(obj, tuple):
            return tuple(serialize(item) for item in obj)
        elif hasattr(obj, '__dict__'):
            return serialize(obj.__dict__)
        else:
            return repr(obj) # Don't know how to handle, convert to string
    return json.dumps(serialize(obj), ensure_ascii=False)
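
# Illustrative example with hypothetical values:
#   json_repr({'token': u'Tere', 'span': (0, 4)})
# returns a unicode JSON string such as u'{"token": "Tere", "span": [0, 4]}'
# (key order may vary); tuples are encoded as JSON arrays.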

def write_header(outfile, inputfile):
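    # Write the header record: source file name, encoding, annotation layer,
    # tool name/version, timestamp and a short description of the output structure.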
    header = {'content':inputfile, 'coding':'utf-8', 'layer':'words',
              'processor':'estnltk',
              'version': pkg_resources.get_distribution("estnltk").version,
              'time': datetime.datetime.now().isoformat(),
              'description':'This file contains a list of words with positions extracted from the content.',
              'structure':'data:[word*:{word_index,token,start,end}]'
              }
    outfile.write(json_repr(header) + u'\n')
    
def write_mapping(outfile):
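    # Write the field mapping that documents the short labels used in data
    # records: i=word_index, t=token, s=start, e=end.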
    mapping = {'fields':[
        {'name':'word', 'label':'', 'type':'O', 'ctx':'data'},
        {'name':'word_index','label':'i', 'type':'N', 'ctx':'word'},
        {'name':'token', 'label':'t', 'type':'S', 'ctx':'word'},
        {'name':'start', 'label':'s', 'type':'N', 'ctx':'word'},
        {'name':'end', 'label':'e', 'type':'N', 'ctx':'word'}
        ]}
    outfile.write(json_repr(mapping) + u'\n')

def main(argv):
    inputfile = ''
    outputfile = ''
    helptext = 'Usage: python tokenizer.py -i <inputfile> -o <outputfile>'
    if len(argv) == 0:
        print helptext
        sys.exit()
    try:
        opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
    except getopt.GetoptError:
        print helptext
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print helptext
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    if not inputfile or not outputfile:
        print helptext
        sys.exit(2)
    infile = codecs.open(inputfile, "r", "utf-8")
    outfile = codecs.open(outputfile, "w", "utf-8")
    outfile.write('{\n')
    outfile.write('"header":')
    write_header(outfile, inputfile)
    outfile.write(', "mapping":')
    write_mapping(outfile)
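    # estnltk reports start/end positions relative to the current line, so keep
    # a running offset to turn them into (approximate) whole-content positions.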
    offset = 0
    idx = 0
    sep = ''
    outfile.write(', "data":[\n')
    for line in infile:
        doc = Text(line)
        doc.tokenize_words()
        tokens = doc['words']
        last_end = 0
        for t in tokens:
            token = json.dumps(t['text'], ensure_ascii=False)  # escape quotes/backslashes so the output stays valid JSON
            outfile.write(sep+'{"i":'+str(idx)+', "t":'+token+', "s":'+str(t['start']+offset)+', "e":'+str(t['end']+offset)+'}\n')
            idx += 1
            last_end = t['end']
            sep = ','
        offset += last_end + 1
    outfile.write(']\n}\n')
    outfile.close()
    infile.close()
    

if __name__ == "__main__":
    main(sys.argv[1:])