Commit b648b958 authored by Andreas Mueller's avatar Andreas Mueller Committed by GitHub

Refactor tokenization, use collocations by default (#178)

* f...ine pep8 changes

* refactor tokenization, use collocations by default.

* trying to refactor sensibly

* return standard cases for each word, disambiguate for bigrams

* add simple smoke tests for the no-collocations path
parent c6d022df
......@@ -31,6 +31,16 @@ Namespaces are one honking great idea -- let's do more of those!
"""
def test_collocations():
    # With collocations enabled the cloud should gain extra bigram entries,
    # so it must end up with strictly more words than the plain-unigram run.
    no_colloc = WordCloud(collocations=False)
    no_colloc.generate(THIS)
    with_colloc = WordCloud(collocations=True)
    with_colloc.generate(THIS)
    assert_greater(len(with_colloc.words_), len(no_colloc.words_))
def test_default():
# test that default word cloud creation and conversions work
wc = WordCloud(max_words=50)
......@@ -187,8 +197,3 @@ def test_generate_from_frequencies():
result = wc.generate_from_frequencies(items)
assert_true(isinstance(result, WordCloud))
def check_parameters():
# check that parameters are actually used
pass
from itertools import tee
from operator import itemgetter
from collections import defaultdict
from math import log
def l(k, n, x):
    """Log-likelihood of observing ``k`` successes in ``n`` Bernoulli
    trials with success probability ``x``.

    Notation follows Dunning's likelihood-ratio test, as presented in
    http://nlp.stanford.edu/fsnlp/promo/colloc.pdf
    Probabilities are clamped to 1e-10 so log(0) is never evaluated.
    """
    return k * log(max(x, 1e-10)) + (n - k) * log(max(1 - x, 1e-10))


def score(count_bigram, count1, count2, n_words):
    """Collocation score"""
    total = n_words
    # null hypothesis: word2 occurs at the same rate everywhere
    p_null = count2 / total
    # alternative: word2's rate differs after word1 vs. elsewhere
    p_after = count_bigram / count1
    p_elsewhere = (count2 - count_bigram) / (total - count1)
    log_ratio = (l(count_bigram, count1, p_null)
                 + l(count2 - count_bigram, total - count1, p_null)
                 - l(count_bigram, count1, p_after)
                 - l(count2 - count_bigram, total - count1, p_elsewhere))
    return -2 * log_ratio
def pairwise(iterable):
    # Sliding window of adjacent pairs (itertools recipe):
    # s -> (s0, s1), (s1, s2), (s2, s3), ...
    first, second = tee(iterable)
    next(second, None)
    return zip(first, second)
def unigrams_and_bigrams(words):
    """Count unigrams and merge statistically significant bigrams in.

    Parameters
    ----------
    words : list of strings
        Tokens in document order (order matters for bigram formation).

    Returns
    -------
    counts_unigrams : dict from string to int
        Counts for each word (cases normalized by ``process_tokens``).
        A bigram that scores as a collocation is added as a single
        "word1 word2" entry, and its component words are decounted by
        the bigram's frequency.
    """
    n_words = len(words)
    # make tuples of two words following each other
    bigrams = list(pairwise(words))
    counts_unigrams, standard_form = process_tokens(words)
    counts_bigrams, _ = process_tokens(
        [" ".join(bigram) for bigram in bigrams])
    # create a copy of counts_unigrams so the score computation is not
    # affected by decounting done inside the loop
    counts = counts_unigrams.copy()
    for bigram_string, count in counts_bigrams.items():
        bigram = tuple(bigram_string.split(" "))
        # A component may have no unigram entry: pure-digit tokens are
        # skipped by process_tokens, and plural keys are folded into
        # their singular. Skip such bigrams instead of raising KeyError.
        word1 = standard_form.get(bigram[0].lower())
        word2 = standard_form.get(bigram[1].lower())
        if word1 is None or word2 is None:
            continue
        # collocation detection (threshold of 30 is arbitrary):
        if score(count, counts[word1], counts[word2], n_words) > 30:
            # decount words inside the accepted bigram ...
            counts_unigrams[word1] -= counts_bigrams[bigram_string]
            counts_unigrams[word2] -= counts_bigrams[bigram_string]
            # ... and add the joined bigram as a pseudo-unigram
            counts_unigrams[bigram_string] = counts_bigrams[bigram_string]
    return counts_unigrams
def process_tokens(words):
    """Normalize cases and remove plurals.

    Each word is represented by its most common capitalization. When a
    word occurs both with and without a trailing "s", the "s" form is
    treated as a plural and merged into the singular's counts.

    Parameters
    ----------
    words : iterable of strings
        Words (unigrams or space-joined bigrams) to count.

    Returns
    -------
    counts : dict from string to int
        Counts per unique word, keyed by the most common capitalization,
        with plurals merged away.
    standard_forms : dict from string to string
        Maps each lower-cased word to its standard capitalization.
    """
    # casings maps word.lower() -> {observed capitalization -> frequency}
    casings = defaultdict(dict)
    for token in words:
        # purely numeric tokens are dropped entirely
        if token.isdigit():
            continue
        variants = casings[token.lower()]
        variants[token] = variants.get(token, 0) + 1

    # fold simple plurals ("dogs") into the singular key ("dog")
    for lowered in list(casings):
        if not lowered.endswith('s'):
            continue
        singular_key = lowered[:-1]
        if singular_key not in casings:
            continue
        target = casings[singular_key]
        for variant, freq in casings[lowered].items():
            trimmed = variant[:-1]
            target[trimmed] = target.get(trimmed, 0) + freq
        del casings[lowered]

    fused_cases = {}
    standard_cases = {}
    by_count = itemgetter(1)
    for lowered, variants in casings.items():
        # pick the most frequent capitalization as the canonical form
        canonical = max(variants.items(), key=by_count)[0]
        fused_cases[canonical] = sum(variants.values())
        standard_cases[lowered] = canonical
    return fused_cases, standard_cases
......@@ -20,13 +20,14 @@ from PIL import ImageDraw
from PIL import ImageFont
from .query_integral_image import query_integral_image
from .tokenization import unigrams_and_bigrams, process_tokens
item1 = itemgetter(1)
FONT_PATH = os.environ.get("FONT_PATH", os.path.join(os.path.dirname(__file__),
"DroidSansMono.ttf"))
STOPWORDS = set([x.strip() for x in open(os.path.join(os.path.dirname(__file__),
'stopwords')).read().split('\n')])
STOPWORDS = set([x.strip() for x in open(
os.path.join(os.path.dirname(__file__), 'stopwords')).read().split('\n')])
class IntegralOccupancyMap(object):
......@@ -41,11 +42,12 @@ class IntegralOccupancyMap(object):
self.integral = np.zeros((height, width), dtype=np.uint32)
def sample_position(self, size_x, size_y, random_state):
return query_integral_image(self.integral, size_x, size_y, random_state)
return query_integral_image(self.integral, size_x, size_y,
random_state)
def update(self, img_array, pos_x, pos_y):
partial_integral = np.cumsum(np.cumsum(img_array[pos_x:, pos_y:], axis=1),
axis=0)
partial_integral = np.cumsum(np.cumsum(img_array[pos_x:, pos_y:],
axis=1), axis=0)
# paste recomputed part into old image
# if x or y is zero it is a bit annoying
if pos_x > 0:
......@@ -72,7 +74,8 @@ def random_color_func(word=None, font_size=None, position=None,
word, font_size, position, orientation : ignored.
random_state : random.Random object or None, (default=None)
If a random object is given, this is used for generating random numbers.
If a random object is given, this is used for generating random
numbers.
"""
if random_state is None:
......@@ -82,14 +85,16 @@ def random_color_func(word=None, font_size=None, position=None,
def get_single_color_func(color):
"""Create a color function which returns a single hue and saturation with.
different values (HSV). Accepted values are color strings as usable by PIL/Pillow.
different values (HSV). Accepted values are color strings as usable by
PIL/Pillow.
>>> color_func1 = get_single_color_func('deepskyblue')
>>> color_func2 = get_single_color_func('#00b4d2')
"""
old_r, old_g, old_b = ImageColor.getrgb(color)
rgb_max = 255.
h, s, v = colorsys.rgb_to_hsv(old_r / rgb_max, old_g / rgb_max, old_b / rgb_max)
h, s, v = colorsys.rgb_to_hsv(old_r / rgb_max, old_g / rgb_max,
old_b / rgb_max)
def single_color_func(word=None, font_size=None, position=None,
orientation=None, font_path=None, random_state=None):
......@@ -103,13 +108,15 @@ def get_single_color_func(color):
word, font_size, position, orientation : ignored.
random_state : random.Random object or None, (default=None)
If a random object is given, this is used for generating random numbers.
If a random object is given, this is used for generating random
numbers.
"""
if random_state is None:
random_state = Random()
r, g, b = colorsys.hsv_to_rgb(h, s, random_state.uniform(0.2, 1))
return 'rgb({:.0f}, {:.0f}, {:.0f})'.format(r * rgb_max, g * rgb_max, b * rgb_max)
return 'rgb({:.0f}, {:.0f}, {:.0f})'.format(r * rgb_max, g * rgb_max,
b * rgb_max)
return single_color_func
......@@ -169,17 +176,23 @@ class WordCloud(object):
Transparent background will be generated when mode is "RGBA" and
background_color is None.
relative_scaling : float (default=0)
Importance of relative word frequencies for font-size.
With relative_scaling=0, only word-ranks are considered.
With relative_scaling=1, a word that is twice as frequent will have twice the size.
If you want to consider the word frequencies and not only their rank, relative_scaling
around .5 often looks good.
relative_scaling : float (default=.5)
Importance of relative word frequencies for font-size. With
relative_scaling=0, only word-ranks are considered. With
relative_scaling=1, a word that is twice as frequent will have twice
the size. If you want to consider the word frequencies and not only
their rank, relative_scaling around .5 often looks good.
.. versionchanged: 2.0
Default is now 0.5.
regexp : string or None (optional)
Regular expression to split the input text into tokens in process_text.
If None is specified, ``r"\w[\w']+"`` is used.
collocations : bool, default=True
Whether to include collocations (bigrams) of two words.
Attributes
----------
``words_``: list of tuples (string, float)
......@@ -191,8 +204,8 @@ class WordCloud(object):
Notes
-----
Larger canvases with make the code significantly slower. If you need a large
word cloud, try a lower canvas size, and set the scale parameter.
Larger canvases will make the code significantly slower. If you need a
large word cloud, try a lower canvas size, and set the scale parameter.
The algorithm might give more weight to the ranking of the words
than their actual frequencies, depending on the ``max_font_size`` and the
......@@ -203,9 +216,11 @@ class WordCloud(object):
ranks_only=None, prefer_horizontal=0.9, mask=None, scale=1,
color_func=random_color_func, max_words=200, min_font_size=4,
stopwords=None, random_state=None, background_color='black',
max_font_size=None, font_step=1, mode="RGB", relative_scaling=0, regexp=None):
max_font_size=None, font_step=1, mode="RGB",
relative_scaling=.5, regexp=None, collocations=True):
if font_path is None:
font_path = FONT_PATH
self.collocations = collocations
self.font_path = font_path
self.width = width
self.height = height
......@@ -228,12 +243,13 @@ class WordCloud(object):
self.max_font_size = max_font_size
self.mode = mode
if relative_scaling < 0 or relative_scaling > 1:
raise ValueError("relative_scaling needs to be between 0 and 1, got %f."
% relative_scaling)
raise ValueError("relative_scaling needs to be "
"between 0 and 1, got %f." % relative_scaling)
self.relative_scaling = relative_scaling
if ranks_only is not None:
warnings.warn("ranks_only is deprecated and will be removed as"
" it had no effect. Look into relative_scaling.", DeprecationWarning)
" it had no effect. Look into relative_scaling.",
DeprecationWarning)
def fit_words(self, frequencies):
"""Create a word_cloud from words and frequencies.
......@@ -270,7 +286,8 @@ class WordCloud(object):
# largest entry will be 1
max_frequency = float(frequencies[0][1])
frequencies = [(word, freq / max_frequency) for word, freq in frequencies]
frequencies = [(word, freq / max_frequency)
for word, freq in frequencies]
self.words_ = frequencies
......@@ -288,15 +305,16 @@ class WordCloud(object):
width = mask.shape[1]
height = mask.shape[0]
if mask.dtype.kind == 'f':
warnings.warn("mask image should be unsigned byte between 0 and"
" 255. Got a float array")
warnings.warn("mask image should be unsigned byte between 0"
" and 255. Got a float array")
if mask.ndim == 2:
boolean_mask = mask == 255
elif mask.ndim == 3:
# if all channels are white, mask out
boolean_mask = np.all(mask[:, :, :3] == 255, axis=-1)
else:
raise ValueError("Got mask of invalid shape: %s" % str(mask.shape))
raise ValueError("Got mask of invalid shape: %s"
% str(mask.shape))
else:
boolean_mask = None
height, width = self.height, self.width
......@@ -316,7 +334,8 @@ class WordCloud(object):
# select the font size
rs = self.relative_scaling
if rs != 0:
font_size = int(round((rs * (freq / float(last_freq)) + (1 - rs)) * font_size))
font_size = int(round((rs * (freq / float(last_freq))
+ (1 - rs)) * font_size))
while True:
# try to find a position
font = ImageFont.truetype(self.font_path, font_size)
......@@ -325,8 +344,8 @@ class WordCloud(object):
orientation = None
else:
orientation = Image.ROTATE_90
transposed_font = ImageFont.TransposedFont(font,
orientation=orientation)
transposed_font = ImageFont.TransposedFont(
font, orientation=orientation)
# get size of resulting text
box_size = draw.textsize(word, font=transposed_font)
# find possible places using integral image:
......@@ -363,7 +382,8 @@ class WordCloud(object):
occupancy.update(img_array, x, y)
last_freq = freq
self.layout_ = list(zip(frequencies, font_sizes, positions, orientations, colors))
self.layout_ = list(zip(frequencies, font_sizes, positions,
orientations, colors))
return self
def process_text(self, text):
......@@ -388,49 +408,25 @@ class WordCloud(object):
include all those things.
"""
self.stopwords_lower_ = set(map(str.lower, self.stopwords))
stopwords = set(map(str.lower, self.stopwords))
d = {}
flags = (re.UNICODE if sys.version < '3' and type(text) is unicode
else 0)
regexp = self.regexp if self.regexp is not None else r"\w[\w']+"
for word in re.findall(regexp, text, flags=flags):
if word.isdigit():
continue
word_lower = word.lower()
if word_lower in self.stopwords_lower_:
continue
# Look in lowercase dict.
try:
d2 = d[word_lower]
except KeyError:
d2 = {}
d[word_lower] = d2
# Look in any case dict.
d2[word] = d2.get(word, 0) + 1
# merge plurals into the singular count (simple cases only)
for key in list(d.keys()):
if key.endswith('s'):
key_singular = key[:-1]
if key_singular in d:
dict_plural = d[key]
dict_singular = d[key_singular]
for word, count in dict_plural.items():
singular = word[:-1]
dict_singular[singular] = dict_singular.get(singular, 0) + count
del d[key]
d3 = {}
for d2 in d.values():
# Get the most popular case.
first = max(d2.items(), key=item1)[0]
d3[first] = sum(d2.values())
return d3
words = re.findall(regexp, text, flags)
# remove stopwords
words = [word for word in words if word.lower() not in stopwords]
# remove 's
words = [word[:-2] if word.lower().endswith("'s") else word
for word in words]
if self.collocations:
word_counts = unigrams_and_bigrams(words)
else:
word_counts, _ = process_tokens(words)
return word_counts
def generate_from_text(self, text):
"""Generate wordcloud from text.
......@@ -465,7 +461,8 @@ class WordCloud(object):
def _check_generated(self):
"""Check if ``layout_`` was computed, otherwise raise error."""
if not hasattr(self, "layout_"):
raise ValueError("WordCloud has not been calculated, call generate first.")
raise ValueError("WordCloud has not been calculated, call generate"
" first.")
def to_image(self):
self._check_generated()
......@@ -475,21 +472,25 @@ class WordCloud(object):
else:
height, width = self.height, self.width
img = Image.new(self.mode, (int(width * self.scale), int(height * self.scale)),
img = Image.new(self.mode, (int(width * self.scale),
int(height * self.scale)),
self.background_color)
draw = ImageDraw.Draw(img)
for (word, count), font_size, position, orientation, color in self.layout_:
font = ImageFont.truetype(self.font_path, int(font_size * self.scale))
transposed_font = ImageFont.TransposedFont(font,
orientation=orientation)
pos = (int(position[1] * self.scale), int(position[0] * self.scale))
font = ImageFont.truetype(self.font_path,
int(font_size * self.scale))
transposed_font = ImageFont.TransposedFont(
font, orientation=orientation)
pos = (int(position[1] * self.scale),
int(position[0] * self.scale))
draw.text(pos, word, fill=color, font=transposed_font)
return img
def recolor(self, random_state=None, color_func=None):
"""Recolor existing layout.
Applying a new coloring is much faster than generating the whole wordcloud.
Applying a new coloring is much faster than generating the whole
wordcloud.
Parameters
----------
......@@ -514,8 +515,10 @@ class WordCloud(object):
self.layout_ = [(word_freq, font_size, position, orientation,
color_func(word=word_freq[0], font_size=font_size,
position=position, orientation=orientation,
random_state=random_state, font_path=self.font_path))
for word_freq, font_size, position, orientation, _ in self.layout_]
random_state=random_state,
font_path=self.font_path))
for word_freq, font_size, position, orientation, _
in self.layout_]
return self
def to_file(self, filename):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment