Commit 2577c429 authored by Andreas Mueller's avatar Andreas Mueller
Browse files

Make things deterministic, add tests for masks.

parent f73deb4a
from wordcloud import WordCloud
import numpy as np
from nose.tools import assert_equal
from nose.tools import assert_equal, assert_greater, assert_true, assert_raises
from numpy.testing import assert_array_equal
from PIL import Image
from tempfile import NamedTemporaryFile
THIS = """The Zen of Python, by Tim Peters
......@@ -28,6 +31,7 @@ Namespaces are one honking great idea -- let's do more of those!
def test_default():
# test that default word cloud creation and conversions work
wc = WordCloud(max_words=50)
wc.generate(THIS)
......@@ -49,13 +53,79 @@ def test_default():
assert_equal(wc_array.shape, (wc.height, wc.width, 3))
def check_errors():
    # Placeholder: the body is a bare ``pass``, so calling this performs no
    # checks and returns None.
    pass
def test_writing_to_file():
    """Check that a generated cloud can be written to disk and read back."""
    wc = WordCloud()
    wc.generate(THIS)

    # check writing to file
    # The context manager closes the temporary file even when an assertion
    # below fails, instead of leaving the handle open until it happens to be
    # garbage collected.
    with NamedTemporaryFile(suffix=".png") as f:
        filename = f.name
        wc.to_file(filename)
        loaded_image = Image.open(filename)
        # The saved image must have the dimensions the cloud was built with.
        assert_equal(loaded_image.size, (wc.width, wc.height))
def test_check_errors():
    """Check the errors raised by a WordCloud that was never generated."""
    cloud = WordCloud()

    # HTML export is expected to signal that it is not implemented.
    assert_raises(NotImplementedError, cloud.to_html)

    # Each of these operations must fail with a ValueError whose message
    # tells the user to call generate; the second item is the failure text
    # used when no exception is raised at all.
    attempts = (
        (lambda: np.array(cloud), "np.array(wc) didn't raise"),
        (cloud.recolor, "wc.recolor didn't raise"),
    )
    for attempt, failure_message in attempts:
        try:
            attempt()
        except ValueError as err:
            assert_true("call generate" in str(err))
        else:
            raise AssertionError(failure_message)
def test_recolor():
    """Check that recoloring keeps the layout but changes the colours."""
    wc = WordCloud(max_words=50)
    wc.generate(THIS)
    array_before = wc.to_array()
    wc.recolor()
    array_after = wc.to_array()
    # check that the same places are filled
    assert_array_equal(array_before.sum(axis=-1) != 0,
                       array_after.sum(axis=-1) != 0)
    # check that they are not the same
    # NOTE(review): the image arrays are presumably unsigned 8-bit. In that
    # case a plain subtraction wraps around (10 - 20 becomes 246) and np.abs
    # has nothing to correct, so convert to a signed type before subtracting.
    difference = array_before.astype(np.int64) - array_after.astype(np.int64)
    assert_greater(np.abs(difference).sum(), 10000)
def test_random_state():
    """Two clouds built with the same integer seed must come out identical."""
    # Construct both clouds first, then generate each, using the same seed
    # and the same text for both.
    clouds = [WordCloud(random_state=0) for _ in range(2)]
    for cloud in clouds:
        cloud.generate(THIS)
    assert_array_equal(*clouds)
def test_mask():
    """Exercise the ``mask`` argument with an empty and a non-empty mask."""
    # An all-zero mask of the default image shape must give the same result
    # as passing no mask at all (same seed on both sides).
    unmasked = WordCloud(random_state=42)
    unmasked.generate(THIS)
    image_shape = np.array(unmasked).shape[:2]
    empty_mask = np.zeros(image_shape)
    masked = WordCloud(mask=empty_mask, random_state=42)
    masked.generate(THIS)
    assert_array_equal(unmasked, masked)

    # A mask with a non-zero rectangle: the image takes the mask's shape,
    # nothing is drawn inside the rectangle, and something is drawn outside.
    blocked = np.zeros((234, 456))
    blocked[100:150, 300:400] = 1
    cloud = WordCloud(mask=blocked)
    cloud.generate(THIS)
    rendered = np.array(cloud)
    assert_equal(blocked.shape, rendered.shape[:2])
    occupied = blocked != 0
    assert_array_equal(rendered[occupied], 0)
    assert_greater(rendered[~occupied].sum(), 10000)
def check_parameters():
    # check that parameters are actually used
    # TODO: not implemented yet -- the body is a bare ``pass``, so nothing is
    # verified here.
    pass
* command line interface
* easy access to image, numpy array
* html export
* good notebook interface
* recoloring support
* by default different color schemes
* examples
* unit tests
* website
* docstrings
* deterministic functionality
* long functions?
* no dependency on sklearn any more?
* redo examples
* examples
* filter one-letter words
......@@ -4,7 +4,7 @@ import array
import numpy as np
def query_integral_image(unsigned int[:,:] integral_image, int size_x, int size_y):
def query_integral_image(unsigned int[:,:] integral_image, int size_x, int size_y, random_state):
cdef int x = integral_image.shape[0]
cdef int y = integral_image.shape[1]
cdef int area, i, j
......@@ -21,7 +21,7 @@ def query_integral_image(unsigned int[:,:] integral_image, int size_x, int size_
# no room left
return None
# pick a location at random
cdef int goal = np.random.randint(hits)
cdef int goal = random_state.randint(0, hits)
hits = 0
for i in xrange(x - size_x):
for j in xrange(y - size_y):
......
......@@ -4,7 +4,7 @@
#
# License: MIT
import random
from random import Random
import os
import re
import numpy as np
......@@ -24,7 +24,7 @@ STOPWORDS = set([x.strip() for x in open(os.path.join(os.path.dirname(__file__),
def random_color_func(word, font_size, position, orientation, random_state=None):
    """Return a random colour as an HSL string.

    Parameters
    ----------
    word, font_size, position, orientation
        Accepted so the function can be called with the same arguments as
        any other colour function; they are not used here.

    random_state : random.Random or None, default=None
        Source of randomness. If None, a fresh ``Random()`` instance is
        created, so the result is not reproducible.

    Returns
    -------
    color : str
        A string of the form ``"hsl(H, 80%, 50%)"`` where H is the integer
        returned by ``random_state.randint(0, 255)``.
    """
    if random_state is None:
        random_state = Random()
    return "hsl(%d, 80%%, 50%%)" % random_state.randint(0, 255)
......@@ -50,9 +50,10 @@ class WordCloud(object):
The ratio of times to try horizontal fitting as opposed to vertical.
mask : nd-array or None (default=None)
If not None, gives a binary mask on where to draw words. In this case,
width and height will be ignored and the shape of mask will be used
instead.
If not None, gives a binary mask on where to draw words. All zero
entries will be considered "free" to draw on, while all non-zero
entries will be deemed occupied. If mask is not None, width and height will be
ignored and the shape of mask will be used instead.
max_words : number (default=200)
The maximum number of words.
......@@ -72,7 +73,7 @@ class WordCloud(object):
def __init__(self, font_path=None, width=400, height=200, margin=5,
ranks_only=False, prefer_horizontal=0.9, mask=None, scale=1,
color_func=random_color_func, max_words=200, stopwords=None):
color_func=random_color_func, max_words=200, stopwords=None, random_state=None):
if stopwords is None:
stopwords = STOPWORDS
if font_path is None:
......@@ -88,8 +89,11 @@ class WordCloud(object):
self.color_func = color_func
self.max_words = max_words
self.stopwords = stopwords
if isinstance(random_state, int):
random_state = Random(random_state)
self.random_state = random_state
def fit_words(self, words):
def _fit_words(self, words):
"""Generate the positions for words.
Parameters
......@@ -113,6 +117,10 @@ class WordCloud(object):
relative differences don't matter. Play with setting the font_size in the
main loop for different styles.
"""
if self.random_state is not None:
random_state = self.random_state
else:
random_state = Random()
if len(words) <= 0:
print("We need at least 1 word to plot a word cloud, got %d."
......@@ -145,7 +153,7 @@ class WordCloud(object):
# try to find a position
font = ImageFont.truetype(self.font_path, font_size)
# transpose font optionally
if random.random() < self.prefer_horizontal:
if random_state.random() < self.prefer_horizontal:
orientation = None
else:
orientation = Image.ROTATE_90
......@@ -156,7 +164,7 @@ class WordCloud(object):
box_size = draw.textsize(word)
# find possible places using integral image:
result = query_integral_image(integral, box_size[1] + self.margin,
box_size[0] + self.margin)
box_size[0] + self.margin, random_state)
if result is not None or font_size == 0:
break
# if we didn't find a place, make font smaller
......@@ -172,7 +180,8 @@ class WordCloud(object):
positions.append((x, y))
orientations.append(orientation)
font_sizes.append(font_size)
colors.append(self.color_func(word, font_size, (x, y), orientation))
colors.append(self.color_func(word, font_size, (x, y), orientation,
random_state=random_state))
# recompute integral image
if self.mask is None:
img_array = np.asarray(img_grey)
......@@ -198,7 +207,7 @@ class WordCloud(object):
self.layout_ = zip(words, font_sizes, positions, orientations, colors)
return self.layout_
def process_text(self, text):
def _process_text(self, text):
"""Splits a long text into words, eliminates the stopwords.
Parameters
......@@ -265,20 +274,32 @@ class WordCloud(object):
return words
def generate(self, text):
    """Generate wordcloud from text.

    Calls _process_text and _fit_words.

    Parameters
    ----------
    text : string
        The input text; it is handed to ``_process_text`` unchanged.

    Returns
    -------
    self
    """
    # _fit_words is given self.words_, which this method expects to be
    # available once _process_text has run.
    self._process_text(text)
    self._fit_words(self.words_)
    return self
def to_image(self):
def _check_generated(self):
    """Raise ValueError unless the ``layout_`` attribute is present."""
    if hasattr(self, "layout_"):
        return
    raise ValueError("WordCloud has not been calculated, call generate first.")
img = Image.new("RGB", (self.width * self.scale, self.height * self.scale))
def to_image(self):
self._check_generated()
if self.mask is not None:
width = self.mask.shape[1]
height = self.mask.shape[0]
else:
height, width = self.height, self.width
img = Image.new("RGB", (width * self.scale, height * self.scale))
draw = ImageDraw.Draw(img)
for (word, count), font_size, position, orientation, color in self.layout_:
font = ImageFont.truetype(self.font_path, font_size * self.scale)
......@@ -296,8 +317,9 @@ class WordCloud(object):
Parameters
----------
random_state : RandomState or None, default=None
If not None, a fixed random state is used.
random_state : RandomState, int, or None, default=None
If not None, a fixed random state is used. If an int is given, this
is used as seed for a random.Random state.
color_func : function or None, default=None
Function to generate new color from word count, font size, position
......@@ -307,6 +329,7 @@ class WordCloud(object):
-------
self
"""
self._check_generated()
if color_func is None:
color_func = self.color_func
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment