Commit 6a77586a authored by Karl's avatar Karl
Browse files

Merge remote-tracking branch 'origin/devel' into devel

parents 92af0757 076d2c19
......@@ -20,6 +20,7 @@ import os
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, '/home/timo/projects/estnltk')
sys.path.insert(0, '/home/keeletehnoloogia/estnltk')
# -- General configuration ------------------------------------------------
......
.. _database_tutorial:
===========================================================
Handling large text collections with ElasticSearch database
===========================================================
=====================================================
Handling large text collections with Elastic database
=====================================================
.. content ..
Mention elasticsearch visualization plugin ES view
To set up Elastic (formerly Elasticsearch), follow the installation guide from the Elastic team
on the webpage `https://www.elastic.co/guide/en/elasticsearch/reference/current/_installation.html`_.
Mention that simply for testing purposes one can increase memory using --ES_MAX_MEM switch
./elasticsearch --ES_MAX_MEM=4g
.. _https://www.elastic.co/guide/en/elasticsearch/reference/current/_installation.html: https://www.elastic.co/guide/en/elasticsearch/reference/current/_installation.html
When the installation is complete, you can run Elastic (from the Elastic folder) with the command::
./elasticsearch
Elastic has a visualization plugin that can be accessed through a browser of your choosing.
To open it, enter `http://localhost:9200/_plugin/head/`_ into the address bar of your browser.
.. _http://localhost:9200/_plugin/head/: http://localhost:9200/_plugin/head/
For simple testing purposes, one can increase the memory by using the ``--ES_MAX_MEM`` switch.
Example of using the memory switch::
./elasticsearch --ES_MAX_MEM=4g
Bulk importing data
===================
......@@ -35,3 +49,40 @@ Eesti Koondkorpus, you can insert them using commands::
python3 -m estnltk.database.importer koond corpora/koond
python3 -m estnltk.database.importer eesti corpora/eesti
Insert Text object to database
==============================
Estnltk provides a Python function for inserting Text objects into an Elastic database for further analysis.
It is important that you create a database before inserting. In the example below, a database named 'test' is created.
Next, a Text object is created from a sentence, and finally the insert() function is called to store it.
Example for using the text insert::
from ..database import Database
from ...text import Text
db = Database('test')
text = Text('Mees, keda seal kohtasime, oli tuttav ja ta teretas meid.')
db.insert(text)
Searching the database for keywords
===================================
To search the Elastic database, you need to specify the name of the database and the keywords
to search for. The search is performed with the query_documents() function.
The example search is from the 'test' database and the search word is 'aegna'::
from ..database import Database
db = Database('test')
search = Database.query_documents(db, "aegna")
The search returns a JSON-formatted result containing the full text of each matching document.
\ No newline at end of file
......@@ -31,13 +31,14 @@ class InsertTest(unittest.TestCase):
def test_insert_default_ids(self):
# this is not a suitable way to remove the warning, because the warning is merely hidden. Better to leave it visible.
# TODO: delete me: warnings.simplefilter("ignore")
self.db.refresh()
db = self.db
# insert the documents
id_first = db.insert(first())
print(id_first)
id_second = db.insert(second())
print(id_second)
# check the count
self.assertEqual(2, db.count())
......@@ -51,19 +52,12 @@ class BulkInsertTest(unittest.TestCase):
def setUp(self):
self.db = Database(TEST_INDEX)
self.db.delete()
#self.db.delete()
def test_bulk_insert(self):
db = self.db
db.refresh()
# it is better to simply move first and second out of InsertTest (I have already done that).
# creating a new instance is unnecessary.
# TODO: delete me.
# insert many (bulk) into db bulk_test
# it = InsertTest()
# text_lists = [it.first, it.second]
text_lists = [first(), second()]
id_bulk = db.bulk_insert(text_lists)
......@@ -74,7 +68,6 @@ class BulkInsertTest(unittest.TestCase):
class SearchTest(unittest.TestCase):
def test_search_keyword_documents(self):
# TODO: move Database setup and initialization to def setUp() method
self.db = Database(TEST_INDEX)
keywords = ["aegna"]
search = Database.query_documents(self.db, query=keywords)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment