tokenizer.py
#!/usr/bin/python3
#coding: utf-8

import sys, getopt
from estnltk import Text
import codecs
import pkg_resources
import datetime
import json

def json_repr(obj):
    """   Represent instance of a class as JSON.
        Arguments:
            obj -- any object
        Return:
            String that represent JSON-encoded object.
    """
    def serialize(obj, l):
        # Recursively walk the object's hierarchy, stopping after l levels of nesting.
        if isinstance(obj, (bool, int, float)):
            return obj
        elif isinstance(obj, str):
            return obj
        elif isinstance(obj, bytes):
            return obj.decode("utf-8")
        elif isinstance(obj, dict) and l > 0:
            obj = obj.copy()
            for key in obj:
                obj[key] = serialize(obj[key], l-1)
            return obj
        elif isinstance(obj, list) and l > 0:
            return [serialize(item, l-1) for item in obj]
        elif isinstance(obj, tuple) and l > 0:
            return tuple(serialize(item, l-1) for item in obj)
        elif hasattr(obj, '__dict__') and l > 0:
            return serialize(obj.__dict__, l-1)
        else:
            return repr(obj) # Don't know how to handle, convert to string
    return json.dumps(serialize(obj, 20))
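
# A minimal sketch of what json_repr produces for a plain object; the Span
# class and values below are illustrative only, not part of this script:
#
#   class Span:
#       def __init__(self, start, end):
#           self.start, self.end = start, end
#
#   json_repr(Span(0, 4))   # -> '{"start": 0, "end": 4}'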


def write_header(outfile, inputfile):
    header = {'content':inputfile, 'coding':'utf-8', 'layer':'words',
              'processor':'estnltk',
              'version': pkg_resources.get_distribution("estnltk").version,
              'time': datetime.datetime.now().isoformat(),
              'description':'This file contains a list of words with positions extracted from the content.',
              'structure':'data:[word*:{word_index,token,start,end}]'
              }
    outfile.write(json_repr(header) + u'\n')
    
def write_mapping(outfile):
    mapping = {'fields':[
        {'name':'word', 'label':'', 'type':'O', 'ctx':'data'},
        {'name':'word_index','label':'i', 'type':'N', 'ctx':'word'},
        {'name':'token', 'label':'t', 'type':'S', 'ctx':'word'},
        {'name':'start', 'label':'s', 'type':'N', 'ctx':'word'},
        {'name':'end', 'label':'e', 'type':'N', 'ctx':'word'}
        ]}
    outfile.write(json_repr(mapping) + u'\n')
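
# Each record later written into the "data" array uses the short labels declared
# in the mapping above; an example word record (values illustrative) would be:
#   {"i":0, "t":"Tere", "s":0, "e":4}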

def main(argv):
    inputfile = ''
    outputfile = ''
    helptext = 'Usage: python3 tokenizer.py -i <inputfile> -o <outputfile>'
    if len(argv) == 0:
        print(helptext)
        sys.exit()
    try:
        opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
    except getopt.GetoptError:
        print(helptext)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(helptext)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    infile = codecs.open(inputfile, "r", "utf-8")
    outfile = codecs.open(outputfile, "w", "utf-8")
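    # The output is a single JSON object of the form
    # {"header":{...}, "mapping":{...}, "data":[{...}, ...]}, assembled piece by piece below.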
    outfile.write('{\n')
    outfile.write('"header":')
    write_header(outfile, inputfile)
    outfile.write(', "mapping":')
    write_mapping(outfile)
    offset = 0
    idx = 0
    sep = ''
    outfile.write(', "data":[\n')
    for line in infile:
        doc = Text(line)
        doc.tag_layer()
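        # tag_layer() runs estnltk's default analysis pipeline; the 'words' layer
        # then yields spans whose .text, .start and .end are offsets within this line.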
        tokens = doc['words']
        for t in tokens:
            # Use json.dumps for the token text so quotes and backslashes are escaped and the record stays valid JSON.
            outfile.write(sep+'{"i":'+str(idx)+', "t":'+json.dumps(t.text, ensure_ascii=False)+', "s":'+str(t.start+offset)+', "e":'+str(t.end+offset)+'}\n')
            idx += 1
            sep = ','
        offset += len(line)  # advance by the full line length (incl. newline) so token positions stay global
    outfile.write(']\n}\n')
    outfile.close()
    infile.close()
    

if __name__ == "__main__":
    main(sys.argv[1:])
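
# Example invocation (file names are illustrative):
#   python3 tokenizer.py -i input.txt -o output.json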