Commit 5441ae57 authored by Indrek Jentson

Changed the word-matching regexp and the word encoding handling in order to support Estonian texts.

parent 4fc252d9
......@@ -5,3 +5,6 @@ wordcloud/query_integral_image.so
doc/_build
doc/auto_examples
doc/generated
.project
.pydevproject
......@@ -2,17 +2,17 @@ from setuptools import setup
from setuptools.extension import Extension
setup(
author="Andreas Mueller",
author="Andreas Mueller, modified by Indrek Jentson",
author_email="t3kcit+wordcloud@gmail.com",
name='wordcloud',
version='1.3',
url='https://github.com/amueller/word_cloud',
description='A little word cloud generator',
version='1.3.1',
url='https://gitlab.keeleressursid.ee/keeleliin/word_cloud_est',
description='A little word cloud generator, modified for Estonian',
license='MIT',
install_requires=['matplotlib', 'numpy>=1.6.1', 'pillow'],
install_requires=['matplotlib', 'numpy>=1.12.1', 'pillow'],
ext_modules=[Extension("wordcloud.query_integral_image",
["wordcloud/query_integral_image.c"])],
scripts=['wordcloud/wordcloud_cli.py'],
packages=['wordcloud'],
package_data={'wordcloud': ['stopwords', 'DroidSansMono.ttf']}
package_data={'wordcloud': ['stopwords.et', 'DroidSansMono.ttf']}
)
aasta
aeg
aga
ainult
alla
andma
asi
eest
ega
ehk
ei
enam
enne
erinev
eriti
esimene
et
ette
hakkama
iga
ikka
ise
isegi
ja
ju
juba
just
järgmine
jääma
ka
kaks
kas
keegi
kes
kogu
koht
kohta
kokku
kolm
koos
kord
kui
kuid
kuidas
kuigi
kuna
kuni
kus
kõige
kõik
küll
laps
läbi
mees
miks
mina
minema
mingi
mis
miski
mitte
mitu
muu
mõni
nagu
naine
nii
ning
nägema
nüüd
olema
oma
osa
palju
panema
peale
pidama
pool
poolt
praegu
puhul
päev
pärast
rohkem
saama
sama
samuti
seal
see
selline
sest
siin
siis
siiski
sina
suur
suurem
tagasi
tahtma
teadma
tegema
teine
tema
tulema
uus
vahel
vaid
vastu
veel
viimane
või
võima
võtma
väga
väike
välja
ära
üks
üle
ütlema
......@@ -26,10 +26,11 @@ from .tokenization import unigrams_and_bigrams, process_tokens
item1 = itemgetter(1)
FONT_PATH = os.environ.get("FONT_PATH", os.path.join(os.path.dirname(__file__),
"DroidSansMono.ttf"))
FONT_PATH = os.environ.get("FONT_PATH", os.path.join(os.path.dirname(__file__), "DroidSansMono.ttf"))
# FONT_PATH = os.environ.get("FONT_PATH", os.path.join(os.path.dirname(__file__), "RobotoCondensed-Regular.ttf"))
STOPWORDS = set([x.strip() for x in open(
os.path.join(os.path.dirname(__file__), 'stopwords')).read().split('\n')])
os.path.join(os.path.dirname(__file__), 'stopwords.et')).read().split('\n')])
class IntegralOccupancyMap(object):
......@@ -434,7 +435,7 @@ class WordCloud(object):
transposed_font = ImageFont.TransposedFont(
font, orientation=orientation)
# get size of resulting text
box_size = draw.textsize(word, font=transposed_font)
box_size = draw.textsize(word.decode('utf8', 'ignore'), font=transposed_font)
# find possible places using integral image:
result = occupancy.sample_position(box_size[1] + self.margin,
box_size[0] + self.margin,
......@@ -458,7 +459,7 @@ class WordCloud(object):
x, y = np.array(result) + self.margin // 2
# actually draw the text
draw.text((y, x), word, fill="white", font=transposed_font)
draw.text((y, x), word.decode('utf8', 'ignore'), fill="white", font=transposed_font)
positions.append((x, y))
orientations.append(orientation)
font_sizes.append(font_size)
......@@ -507,14 +508,14 @@ class WordCloud(object):
flags = (re.UNICODE if sys.version < '3' and type(text) is unicode
else 0)
regexp = self.regexp if self.regexp is not None else r"\w[\w']+"
regexp = self.regexp if self.regexp is not None else r"[A-z,ŠŽÕÄÖÜšžõäöü]{2,}"
words = re.findall(regexp, text, flags)
# remove stopwords
words = [word for word in words if word.lower() not in stopwords]
# remove 's
words = [word[:-2] if word.lower().endswith("'s") else word
for word in words]
# words = [word[:-2] if word.lower().endswith("'s") else word
# for word in words]
# remove numbers
words = [word for word in words if not word.isdigit()]
......@@ -580,7 +581,7 @@ class WordCloud(object):
font, orientation=orientation)
pos = (int(position[1] * self.scale),
int(position[0] * self.scale))
draw.text(pos, word, fill=color, font=transposed_font)
draw.text(pos, word.decode('utf8', 'ignore') , fill=color, font=transposed_font)
return img
def recolor(self, random_state=None, color_func=None, colormap=None):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment