Commit 632d1594 authored by Timo Petmanson's avatar Timo Petmanson
Browse files

Wrote some tutorials for grammarextractor

parent 1318d127
estnltk.grammar module
.. automodule:: estnltk.grammar.grammar
......@@ -143,6 +143,7 @@ API reference
......@@ -25,6 +25,12 @@ Example of using the memory switch::
./elasticsearch --ES_MAX_MEM=4g
Connecting to the server
By default, Elastic tries to connect to localhost.
Bulk importing data
......@@ -3,10 +3,111 @@ Simple grammars for information extraction
.. highlight:: python
Estnltk comes with simple grammar constructs that are useful for basic information extraction.
Consider that you have a recipe for making pancakes::
recipe = '''
2,5 dl piima
1,5 dl jahu
1 muna
1 tl suhkrut
1 tl vaniljeekstrakti
0,5 tl soola
Suppose you want to create a robot that can cook various meals.
In order to program that robot, you need a software module, which can parse recipes.
This is where Estnltk's ``estnltk.grammar.grammar`` module can help you.
In the above example, we need to parse the number, the unit and the name of each ingredient
into a more manageable form than free text::
from estnltk import Regex, Lemmas
number = Regex('\d+([,.]\d+)?', name='amount')
unit = Lemmas('dl', 'tl', name='unit')
ingredient = Lemmas('piim', 'jahu', 'muna', 'suhkur', 'vaniljeekstrakt', 'sool', name='ingredient')
Now, there are two types of instructions::
from estnltk import Concatenation
space = Regex('\s*')
full_instruction = Concatenation(number, unit, ingredient, sep=space)
short_instruction = Concatenation(number, ingredient, sep=space)
And we want to capture them both::
from estnltk import Union
instruction = Union(full_instruction, short_instruction, name='instruction')
Basically, a grammar contains a number of symbols that can be chained together in various ways
and rigged for information extraction.
The above grammar simply extracts numbers defined by a regular expression, and units and ingredients
based on user-supplied lists.
Now, going back to our robot example, we can extract the data from text using the ``get_matches`` method::
from estnltk import Text
from pprint import pprint
text = Text(recipe)
for match in instruction.get_matches(text):
The ``dict`` attribute of each :py:class:`~estnltk.grammar.match.Match` instance can be used
to access the symbol's name, matched text, start and end positions and also all submatches::
{'amount': {'end': 4, 'start': 1, 'text': '2,5'},
'ingredient': {'end': 13, 'start': 8, 'text': 'piima'},
'instruction': {'end': 13, 'start': 1, 'text': '2,5 dl piima'},
'unit': {'end': 7, 'start': 5, 'text': 'dl'}}
'ingredient': {'end': 80, 'start': 75, 'text': 'soola'},
'instruction': {'end': 80, 'start': 68, 'text': '0,5 tl soola'},
'unit': {'end': 74, 'start': 72, 'text': 'tl'}}
You can also use the symbols to tag layers directly in :py:class:`~estnltk.text.Text` instances::
Let's use ``PrettyPrinter`` to visualize this as HTML::
from estnltk import PrettyPrinter
pp = PrettyPrinter(background='instruction', underline='ingredient', weight='unit')
pp.render(text, add_header=True)
.. raw:: html
mark.background {
background-color: rgb(102, 204, 255);
mark.weight {
font-weight: bold;
mark.underline {
text-decoration: underline;
<mark class="background">2,5 </mark><mark class="background weight">dl</mark><mark class="background"> </mark><mark class="background underline">piima</mark><br/><mark class="background">1,5 </mark><mark class="background weight">dl</mark><mark class="background"> </mark><mark class="background underline">jahu</mark><br/><mark class="background">1 </mark><mark class="background underline">muna</mark><br/><mark class="background">1 </mark><mark class="background weight">tl</mark><mark class="background"> </mark><mark class="background underline">suhkrut</mark><br/><mark class="background">1 </mark><mark class="background weight">tl</mark><mark class="background"> </mark><mark class="background underline">vaniljeekstrakti</mark><br/><mark class="background">0,5 </mark><mark class="background weight">tl</mark><mark class="background"> </mark><mark class="background underline">soola</mark><br/>
You can access the annotated layers as you would access typical layers::
... content ...
[{'end': 13, 'start': 8, 'text': 'piima'},
{'end': 25, 'start': 21, 'text': 'jahu'},
{'end': 32, 'start': 28, 'text': 'muna'},
{'end': 45, 'start': 38, 'text': 'suhkrut'},
{'end': 67, 'start': 51, 'text': 'vaniljeekstrakti'},
{'end': 80, 'start': 75, 'text': 'soola'}]
See package ``estnltk.grammar.examples`` for more examples.
\ No newline at end of file
......@@ -11,4 +11,4 @@ from .clausesegmenter import ClauseSegmenter
from .disambiguator import Disambiguator
from .prettyprinter import PrettyPrinter
from .database import Database
from .grammar import *
......@@ -5,7 +5,8 @@ import regex as re
from functools import reduce
from itertools import chain
from collections import defaultdict
import six
from ..text import Text
from .match import Match, concatenate_matches, copy_rename, intersect
from .conflictresolver import resolve_using_maximal_coverage
......@@ -21,6 +22,8 @@ class Symbol(object):
return self.__name
def annotate(self, text, conflict_resolver=resolve_using_maximal_coverage):
if isinstance(text, six.string_types):
text = Text(text)
matches = self.get_matches(text, conflict_resolver=conflict_resolver)
layers = defaultdict(list)
for m in matches:
......@@ -247,8 +250,22 @@ def concat(matches_a, matches_b, text, name=None):
class Concatenation(Symbol):
def __init__(self, *symbols, **kwargs):
symbol.. : list of :py:class:`~estnltk.grammar.Symbol`
The symbols that are going to be concatenated.
sep: :py:class:`~estnltk.grammar.Symbol`
The optional separator symbol.
super(Concatenation, self).__init__(kwargs.get('name'))
self.__symbols = symbols
sep = kwargs.get('sep', None)
self.__symbols = []
for idx, sym in enumerate(symbols):
if idx > 0:
def symbols(self):
......@@ -47,8 +47,10 @@ class Match(dict):
def dict(self):
res = copy(self)
del res[MATCHES]
del res[NAME]
if MATCHES in res:
del res[MATCHES]
if NAME in res:
del res[NAME]
res = { res}
for k, v in self.matches.items():
res[k] = v
......@@ -55,7 +55,7 @@ AES_CSS_MAP = {
FONT: 'font-family',
WEIGHT: 'font-weight',
ITALICS: 'font-style',
UNDERLINE: 'font-decoration',
UNDERLINE: 'text-decoration',
SIZE: 'font-size',
TRACKING: 'letter-spacing'
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import
import unittest
import os
import timeit
from pprint import pprint
from ..text import Text
from ..teicorpus import parse_tei_corpus
from ..core import AA_PATH
dummy = Text('Tere maailm!').tag_analysis()
docs = parse_tei_corpus(os.path.join(AA_PATH, 'tea_AA_00_1.tasak.xml'))
plain = docs[5].text
n = len(plain)//2
half1, half2 = plain[:n], plain[n:]
def large_document():
def small_documents():
class LargeTextTest(unittest.TestCase):
"""Test for ensuring that basic processing time of texts has linear complexity.
This is good for detecting inefficient loops that depend on text size/complexity.
def test_time(self):
number = 10
large_time = timeit.timeit(large_document, number=number)
small_time = timeit.timeit(small_documents, number=number)
print('Large document: ', large_time)
print('Small documents:', small_time)
diff = abs((float(large_time) / float(small_time)) - 1.0)
self.assertTrue(diff < 0.1) # fail with 10% difference
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment