Commit 025137e3 authored by Timo Petmanson

Merge branch 'devel' of github.com:estnltk/estnltk into devel

* 'devel' of github.com:estnltk/estnltk:
  Added experimental NP_Chunker
  Changed the subprocess command of maltparser
  Added placeholder for experimental NP chunker's tutorial.
  Fixed maltparser installation bug
  Refactoring elasticsearch interface
  Added placeholder for MaltParser's tutorial
  Create elasticsearch.rst
  https://github.com/estnltk/estnltk/issues/28: Modified the tests and added subprocess termination to Java based tools - TimexTagger and ClauseSegmenter
  Added MaltParser support
parents d058ceb6 57a8f19d
TODO
\ No newline at end of file
TODO
\ No newline at end of file
from .database import Database
\ No newline at end of file
from .database import Database
from .elastic import connect, create_index, Index
@@ -2,18 +2,17 @@
from __future__ import unicode_literals, print_function, absolute_import
import copy
import itertools
import json
import elasticsearch
import elasticsearch.helpers
import itertools
import json
from .mapping import mapping
from ..text import Text
def create_index(index_name, **kwargs):
"""
Parameters
----------
index_name : str
@@ -25,9 +24,7 @@ def create_index(index_name, **kwargs):
-------
Index
"""
mapping = json.load(open('mapping.json'))
es = elasticsearch.Elasticsearch(**kwargs)
es.indices.create(index=index_name, body=mapping)
return connect(index_name, **kwargs)
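For reference, a minimal sketch of how the new interface is meant to be used; the index name 'demo' is illustrative and a local Elasticsearch node on the default port is assumed:

# Minimal usage sketch (assumptions: local Elasticsearch node on the
# default port; the index name 'demo' is illustrative).
from estnltk.database.elastic import create_index, connect

index = create_index('demo')  # create a new index with the estnltk mapping
index = connect('demo')       # or attach to an index that already exists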
@@ -54,14 +51,13 @@ def connect(index_name, **kwargs):
class Index:
def __init__(self, client, index_name):
def __init__(self, index_name):
"""
Parameters
----------
client : elasticsearch.Elasticsearch
client : Elasticsearch
index_name : str
"""
self.index_name = index_name
self.client = client
@@ -70,16 +66,6 @@ class Index:
def sentences(self, exclude_ids=None, query=None, **kwargs):
if exclude_ids is None:
for document in elasticsearch.helpers.scan(self.client, query=query, doc_type='sentence', **kwargs):
# text = Text(document['estnltk_text'])
# text.__db_meta = document['meta']
# yield text
# for i in index.sentences(query={
#
# 'fields':['estnltk_text_object']
# }):
# print(i)
yield Text(json.loads(document['fields']['estnltk_text_object'][0]))
else:
raise NotImplementedError('ID exclusion is not implemented')
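The commented-out lines above hint at the expected query shape. A hedged sketch of iterating stored sentences, reusing that hint (the surrounding setup is assumed, not part of this diff):

# Sketch: iterate stored sentences as estnltk Text objects.
# The query body mirrors the commented-out hint in the diff above.
from estnltk.database.elastic import connect

index = connect('example_index')
for snt in index.sentences(query={'fields': ['estnltk_text_object']}):
    print(snt)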
@@ -125,12 +111,27 @@ class Index:
}
yield json.dumps(sentence)
def save(self, document):
def save(self, document, meta=None):
if getattr(document, '__db_meta', None):
# we should overwrite a previous object
raise NotImplementedError
raise NotImplementedError('Changing objects in the database has not been implemented.')
else:
# we should create a new object
document_in_es = self.client.index(self.index_name, 'document', {})
document_in_es = self.client.index(self.index_name, 'document', {} if meta is None else meta)
for sent in self._get_indexable_sentences(document):
self.client.index(self.index_name,
'sentence',
sent,
parent=document_in_es['_id'])
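A short sketch of the extended save() signature; the meta keys are illustrative, not taken from this diff:

# Sketch: store a Text together with optional metadata.
from estnltk import Text

text = Text('Mees peeti kinni.')
index.save(text, meta={'source': 'example_corpus'})  # meta keys illustrative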
def get_iter(self, document, meta=None):
if getattr(document, '__db_meta', None):
# we should overwrite a previous object
raise NotImplementedError('Changing objects in the database has not been implemented.')
else:
# we should create a new object
yield ('document', {} if meta is None else meta)
for sent in self._get_indexable_sentences(document):
self.client.index(self.index_name, 'sentence', sent, parent=document_in_es['_id'])
yield ('sentence', sent)
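Unlike save(), get_iter() only yields (doc_type, source) pairs and leaves the indexing to the caller. A hedged sketch of consuming it (the loop is an assumption, not shown in this diff):

# Sketch: consume get_iter() and index the yielded pairs by hand.
# Parent/child wiring for sentences is not shown in the diff, so it is
# omitted here as well.
for doc_type, source in index.get_iter(text, meta={'source': 'example'}):
    index.client.index(index.index_name, doc_type, source)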
# -*- coding:utf-8 -*-
from __future__ import print_function, absolute_import
from estnltk import Text
from estnltk.database import elastic
import elastic
from ..text import Text
try:
index = elastic.create_index('example_index')
......
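The example file is truncated above. A standalone hedged sketch of the workflow it appears to demonstrate, with all literals illustrative and the except branch an assumption:

# Hedged sketch, not the original example's continuation.
try:
    index = elastic.create_index('example_index')
except Exception:  # assumed: the index already exists
    index = elastic.connect('example_index')

text = Text('Mees, keda seal kohtasime, oli tuttav.')
index.save(text)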
{
"mappings": {
"document": {
"_all": {
"enabled": false
},
"properties": {
"meta": {
"type": "object"
}
}
},
"flag": {
"_all": {
"enabled": false
},
"_parent": {
"type": "sentence"
},
"properties": {
"value": {
"doc_values": true,
"index": "not_analyzed",
"norms": {
"enabled": false
},
"type": "string"
}
}
},
"sentence": {
"_all": {
"enabled": false
},
"_parent": {
"type": "document"
},
"properties": {
"estnltk_text_object": {
"index": "no",
"store": true,
"type": "string"
},
"lemmas": {
"analyzer": "estnltk_lowercase",
"norms": {
"enabled": false
},
"type": "string",
"position_increment_gap": 100
},
"meta": {
"properties": {
"order_in_parent": {
"doc_values": true,
"norms": {
"enabled": false
},
"type": "long"
}
}
},
"postags": {
"analyzer": "estnltk_uppercase",
"norms": {
"enabled": false
},
"type": "string",
"position_increment_gap": 100
},
"text": {
"analyzer": "whitespace",
"norms": {
"enabled": false
},
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"words": {
"properties": {
"analysis": {
"properties": {
"clitic": {
"doc_values": true,
"index": "not_analyzed",
"norms": {
"enabled": false
},
"type": "string"
},
"ending": {
"doc_values": true,
"index": "not_analyzed",
"norms": {
"enabled": false
},
"type": "string"
},
"form": {
"doc_values": true,
"index": "not_analyzed",
"norms": {
"enabled": false
},
"type": "string"
},
"lemma": {
"doc_values": true,
"index": "not_analyzed",
"norms": {
"enabled": false
},
"type": "string"
},
"partofspeech": {
"doc_values": true,
"index": "not_analyzed",
"norms": {
"enabled": false
},
"type": "string"
},
"root": {
"doc_values": true,
"index": "not_analyzed",
"norms": {
"enabled": false
},
"type": "string"
},
"root_tokens": {
"doc_values": true,
"index": "not_analyzed",
"norms": {
"enabled": false
},
"type": "string"
}
},
"type": "nested"
},
"text": {
"doc_values": true,
"index": "not_analyzed",
"norms": {
"enabled": false
},
"type": "string"
}
}
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"estnltk_lowercase": {
"filter": [
"lowercase"
],
"tokenizer": "whitespace",
"type": "custom"
},
"estnltk_uppercase": {
"filter": [
"uppercase"
],
"tokenizer": "whitespace",
"type": "custom"
}
}
}
}
}
# This is the mapping file for creating new indexes.
# It is somewhat more verbose than it needs to be, but it is explicit in its choices.
mapping = {
"mappings": {
"document": {
"_all": {
"enabled": False # We rarely want to search over all fields
},
"properties": {
"meta": {
"type": "object"
}
}
},
"flag": {
"_all": {
"enabled": False
},
"_parent": {
"type": "sentence"
},
"properties": {
"value": {
"doc_values": True,
"index": "not_analyzed",
"norms": {
"enabled": False
},
"type": "string"
}
}
},
"sentence": {
"_all": {
"enabled": False
},
"_parent": {
"type": "document"
},
"properties": {
"estnltk_text_object": { # raw estnltk text object
"index": "no", # not searchable
"store": True, # but stored separately
"type": "string" # Not analyzed, stored as text
},
"lemmas": {
"analyzer": "estnltk_lowercase",
"norms": {
"enabled": False
},
"type": "string",
"position_increment_gap": 100 # different analyses are separated by 100 positions
},
"meta": {
"properties": {
"order_in_parent": {
"doc_values": True, # keep less information in memory
"norms": { # do not compute data for scoring results
"enabled": False
},
"type": "long"
}
}
},
"postags": {
"analyzer": "estnltk_uppercase",
"norms": {
"enabled": False
},
"type": "string",
"position_increment_gap": 100
},
"text": {
"analyzer": "whitespace",
"norms": {
"enabled": False
},
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
},
"words": {
"properties": {
"analysis": {
"properties": {
"clitic": {
"doc_values": True,
"index": "not_analyzed",
"norms": {
"enabled": False
},
"type": "string"
},
"ending": {
"doc_values": True,
"index": "not_analyzed",
"norms": {
"enabled": False
},
"type": "string"
},
"form": {
"doc_values": True,
"index": "not_analyzed",
"norms": {
"enabled": False
},
"type": "string"
},
"lemma": {
"doc_values": True,
"index": "not_analyzed",
"norms": {
"enabled": False
},
"type": "string"
},
"partofspeech": {
"doc_values": True,
"index": "not_analyzed",
"norms": {
"enabled": False
},
"type": "string"
},
"root": {
"doc_values": True,
"index": "not_analyzed",
"norms": {
"enabled": False
},
"type": "string"
},
"root_tokens": {
"doc_values": True,
"index": "not_analyzed",
"norms": {
"enabled": False
},
"type": "string"
}
},
"type": "nested"
},
"text": {
"doc_values": True,
"index": "not_analyzed",
"norms": {
"enabled": False
},
"type": "string"
}
}
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"estnltk_lowercase": {
"filter": [
"lowercase"
],
"tokenizer": "whitespace",
"type": "custom"
},
"estnltk_uppercase": {
"filter": [
"uppercase"
],
"tokenizer": "whitespace",
"type": "custom"
}
}
}
}
}
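The dict above replaces the old mapping.json file and is handed to Elasticsearch verbatim when an index is created. A sketch mirroring what create_index does (index name illustrative):

# Sketch: the Python mapping is passed straight to Elasticsearch as
# the index body, exactly as create_index does above.
import elasticsearch
from estnltk.database.mapping import mapping

es = elasticsearch.Elasticsearch()
es.indices.create(index='example_index', body=mapping)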
Copyright (c) 2007-2014 Johan Hall, Jens Nilsson and Joakim Nivre
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the distribution.
* Neither the name of MaltParser nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
@@ -10,29 +10,43 @@ from ..clausesegmenter import ClauseSegmenter
class ClausesTest(unittest.TestCase):
def test_divide_multi(self):
text = Text('Kõrred, millel on toitunud viljasääse vastsed, jäävad õhukeseks.')
segmenter = ClauseSegmenter()
text = Text('Kõrred, millel on toitunud viljasääse vastsed, jäävad õhukeseks.', clause_segmenter = segmenter)
clauses = text.divide('words', 'clauses')
korred, _1, millel, on, toitunud, viljasaase, vastsed, _2, jaavad, ohukeseks, _3 = text.words
self.assertListEqual([korred, jaavad, ohukeseks, _3], clauses[0])
self.assertListEqual([_1, millel, on, toitunud, viljasaase, vastsed, _2], clauses[1])
self.assertEqual(len(clauses), 2)
# Terminate Java process in order to avoid "OSError: [WinError 6] The handle is invalid"
# in subsequent Java processing
segmenter._process.terminate()
def test_split_by_clauses(self):
text = Text('Kõrred, millel on toitunud viljasääse vastsed, jäävad õhukeseks.')
outer = Text('Kõrred jäävad õhukeseks.').tag_clauses()
inner = Text(', millel on toitunud väljasääse vastsed,').tag_clauses()
segmenter = ClauseSegmenter()
text = Text('Kõrred, millel on toitunud viljasääse vastsed, jäävad õhukeseks.', clause_segmenter = segmenter)
outer = Text('Kõrred jäävad õhukeseks.', clause_segmenter = segmenter).tag_clauses()
inner = Text(', millel on toitunud väljasääse vastsed,', clause_segmenter = segmenter).tag_clauses()
outer_split, inner_split = text.split_by('clauses')
self.assertListEqual(inner.word_spans, inner_split.word_spans)
self.assertListEqual(outer.word_spans, outer_split.word_spans)
# Terminate Java process in order to avoid "OSError: [WinError 6] The handle is invalid"
# in subsequent Java processing
segmenter._process.terminate()
def test_ignore_missing_commas_1(self):
segmenter = ClauseSegmenter( ignore_missing_commas=True )
text = Text('Pritsimehed leidsid eest lõõmava kapotialusega auto mida läheduses parkinud masinate sohvrid eemale üritasid lükata kuid esialgu see ei õnnestunud sest autol oli käik sees.', clause_segmenter = segmenter)
clauses = text.divide('words', 'clauses')
self.assertEqual(len(clauses), 4)
# Terminate Java process in order to avoid "OSError: [WinError 6] The handle is invalid"
# in subsequent Java processing
segmenter._process.terminate()
def test_ignore_missing_commas_2(self):
segmenter = ClauseSegmenter( ignore_missing_commas=True )
text = Text('Keegi teine ka siin ju kirjutas et ütles et saab ise asjadele järgi minna aga vastust seepeale ei tulnudki.', clause_segmenter = segmenter)
clauses = text.divide('words', 'clauses')
self.assertEqual(len(clauses), 4)
# Terminate Java process in order to avoid "OSError: [WinError 6] The handle is invalid"
# in subsequent Java processing
segmenter._process.terminate()
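The terminate() boilerplate repeats in every test. A hedged alternative sketch using unittest's addCleanup; this pattern is not part of the diff:

# Alternative sketch (not in this diff): register termination once so
# the Java subprocess is stopped even if a test fails before reaching
# the explicit terminate() call.
import unittest
from ..clausesegmenter import ClauseSegmenter

class ClausesTestBase(unittest.TestCase):
    def make_segmenter(self, **kwargs):
        segmenter = ClauseSegmenter(**kwargs)
        self.addCleanup(segmenter._process.terminate)
        return segmenter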