Commit b648b958 authored by Andreas Mueller, committed by GitHub

Refactor tokenization, use collocations by default (#178)

* f...ine pep8 changes

* refactor tokenization, use collocations by default.

* trying to refactor sensibly

* return standard cases for each word, disambiguate for bigrams

* add simple smoke tests for the no-collocations path
parent c6d022df
...@@ -31,6 +31,16 @@ Namespaces are one honking great idea -- let's do more of those!
"""
def test_collocations():
    wc = WordCloud(collocations=False)
    wc.generate(THIS)

    wc2 = WordCloud(collocations=True)
    wc2.generate(THIS)
    assert_greater(len(wc2.words_), len(wc.words_))
def test_default():
    # test that default word cloud creation and conversions work
    wc = WordCloud(max_words=50)
...@@ -187,8 +197,3 @@ def test_generate_from_frequencies():
    result = wc.generate_from_frequencies(items)
    assert_true(isinstance(result, WordCloud))
def check_parameters():
    # check that parameters are actually used
    pass
from itertools import tee
from operator import itemgetter
from collections import defaultdict
from math import log
def l(k, n, x):
    # Dunning's likelihood ratio with notation from
    # http://nlp.stanford.edu/fsnlp/promo/colloc.pdf
    return log(max(x, 1e-10)) * k + log(max(1 - x, 1e-10)) * (n - k)
def score(count_bigram, count1, count2, n_words):
    """Collocation score"""
    N = n_words
    c12 = count_bigram
    c1 = count1
    c2 = count2
    p = c2 / N
    p1 = c12 / c1
    p2 = (c2 - c12) / (N - c1)
    score = (l(c12, c1, p) + l(c2 - c12, N - c1, p)
             - l(c12, c1, p1) - l(c2 - c12, N - c1, p2))
    return -2 * score
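# Editor's sketch (not part of this commit): a quick sanity check of the
# likelihood-ratio score above, assuming the module is importable as
# wordcloud.tokenization and Python 3 division. A pair that always co-occurs
# scores far above the threshold of 30 used in unigrams_and_bigrams below; a
# pair that co-occurs at roughly chance level scores near zero.
from wordcloud.tokenization import score

print(score(10, 10, 10, 1000))  # bigram seen every time both words occur: ~112
print(score(3, 50, 50, 1000))   # co-occurrence near chance level: ~0.1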
def pairwise(iterable):
    # from itertools recipes
    # s -> (s0,s1), (s1,s2), (s2, s3), ...
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)
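# Editor's sketch (not part of this commit): what pairwise() yields, assuming
# the module is importable as wordcloud.tokenization.
from wordcloud.tokenization import pairwise

print(list(pairwise(["the", "quick", "brown", "fox"])))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]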
def unigrams_and_bigrams(words):
    n_words = len(words)
    # make tuples of two words following each other
    bigrams = list(pairwise(words))
    counts_unigrams = defaultdict(int)
    counts_bigrams = defaultdict(int)
    counts_unigrams, standard_form = process_tokens(words)
    counts_bigrams, standard_form_bigrams = process_tokens(
        [" ".join(bigram) for bigram in bigrams])
    # create a copy of counts_unigrams so the score computation is not changed
    counts = counts_unigrams.copy()
    # decount words inside bigrams
    for bigram_string, count in counts_bigrams.items():
        bigram = tuple(bigram_string.split(" "))
        # collocation detection (30 is arbitrary):
        word1 = standard_form[bigram[0].lower()]
        word2 = standard_form[bigram[1].lower()]
        if score(count, counts[word1], counts[word2], n_words) > 30:
            counts_unigrams[word1] -= counts_bigrams[bigram_string]
            counts_unigrams[word2] -= counts_bigrams[bigram_string]
            # add joined bigram into unigrams
            counts_unigrams[bigram_string] = counts_bigrams[bigram_string]
    return counts_unigrams
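# Editor's sketch (not part of this commit): how unigrams_and_bigrams fuses a
# frequent pair, assuming the module is importable as wordcloud.tokenization
# and Python 3 division.
from wordcloud.tokenization import unigrams_and_bigrams

words = []
for i in range(10):
    words += ["New", "York", "filler{}".format(i)]

counts = unigrams_and_bigrams(words)
print(counts)
# "New York" should clear the threshold (score of roughly 38 > 30), so it is
# kept as a single entry and its count is subtracted from the plain words:
# counts["New York"] == 10, counts["New"] == 0, counts["York"] == 0, and each
# filler word keeps a count of 1.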
def process_tokens(words):
    """Normalize cases and remove plurals.

    Each word is represented by the most common case.
    If a word appears with an "s" on the end and without an "s" on the end,
    the version with "s" is assumed to be a plural and merged with the
    version without "s".

    Parameters
    ----------
    words : iterable of strings
        Words to count.

    Returns
    -------
    counts : dict from string to int
        Counts for each unique word, with cases represented by the most common
        case, and plurals removed.

    standard_forms : dict from string to string
        For each lower-case word the standard capitalization.
    """
    # words can be either a list of unigrams or bigrams
    # d is a dict of dicts.
    # Keys of d are word.lower(). Values are dicts
    # counting frequency of each capitalization
    d = defaultdict(dict)
    for word in words:
        if word.isdigit():
            continue

        word_lower = word.lower()
        # get dict of cases for word_lower
        case_dict = d[word_lower]
        # increase this case
        case_dict[word] = case_dict.get(word, 0) + 1

    # merge plurals into the singular count (simple cases only)
    for key in list(d.keys()):
        if key.endswith('s'):
            key_singular = key[:-1]
            if key_singular in d:
                dict_plural = d[key]
                dict_singular = d[key_singular]
                for word, count in dict_plural.items():
                    singular = word[:-1]
                    dict_singular[singular] = (dict_singular.get(singular, 0)
                                               + count)
                del d[key]

    fused_cases = {}
    standard_cases = {}
    item1 = itemgetter(1)
    for word_lower, case_dict in d.items():
        # Get the most popular case.
        first = max(case_dict.items(), key=item1)[0]
        fused_cases[first] = sum(case_dict.values())
        standard_cases[word_lower] = first
    return fused_cases, standard_cases
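# Editor's sketch (not part of this commit): case folding and plural merging in
# process_tokens, assuming the module is importable as wordcloud.tokenization.
from wordcloud.tokenization import process_tokens

counts, standard = process_tokens(["Flower", "Flower", "Flower", "flower",
                                   "flowers"])
print(counts)    # {'Flower': 5}  -- plural folded in, most common case kept
print(standard)  # {'flower': 'Flower'}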
...@@ -20,13 +20,14 @@ from PIL import ImageDraw
from PIL import ImageFont

from .query_integral_image import query_integral_image
from .tokenization import unigrams_and_bigrams, process_tokens

item1 = itemgetter(1)

FONT_PATH = os.environ.get("FONT_PATH", os.path.join(os.path.dirname(__file__),
                                                     "DroidSansMono.ttf"))
STOPWORDS = set([x.strip() for x in open(
    os.path.join(os.path.dirname(__file__), 'stopwords')).read().split('\n')])


class IntegralOccupancyMap(object):
...@@ -41,11 +42,12 @@ class IntegralOccupancyMap(object):
        self.integral = np.zeros((height, width), dtype=np.uint32)

    def sample_position(self, size_x, size_y, random_state):
        return query_integral_image(self.integral, size_x, size_y,
                                    random_state)

    def update(self, img_array, pos_x, pos_y):
        partial_integral = np.cumsum(np.cumsum(img_array[pos_x:, pos_y:],
                                               axis=1), axis=0)
        # paste recomputed part into old image
        # if x or y is zero it is a bit annoying
        if pos_x > 0:
...@@ -72,7 +74,8 @@ def random_color_func(word=None, font_size=None, position=None,
    word, font_size, position, orientation : ignored.

    random_state : random.Random object or None, (default=None)
        If a random object is given, this is used for generating random
        numbers.
    """
    if random_state is None:
...@@ -82,14 +85,16 @@ def random_color_func(word=None, font_size=None, position=None,
def get_single_color_func(color):
    """Create a color function which returns a single hue and saturation with
    different values (HSV). Accepted values are color strings as usable by
    PIL/Pillow.

    >>> color_func1 = get_single_color_func('deepskyblue')
    >>> color_func2 = get_single_color_func('#00b4d2')
    """
    old_r, old_g, old_b = ImageColor.getrgb(color)
    rgb_max = 255.
    h, s, v = colorsys.rgb_to_hsv(old_r / rgb_max, old_g / rgb_max,
                                  old_b / rgb_max)

    def single_color_func(word=None, font_size=None, position=None,
                          orientation=None, font_path=None, random_state=None):
...@@ -103,13 +108,15 @@ def get_single_color_func(color):
        word, font_size, position, orientation : ignored.

        random_state : random.Random object or None, (default=None)
            If a random object is given, this is used for generating random
            numbers.
        """
        if random_state is None:
            random_state = Random()
        r, g, b = colorsys.hsv_to_rgb(h, s, random_state.uniform(0.2, 1))
        return 'rgb({:.0f}, {:.0f}, {:.0f})'.format(r * rgb_max, g * rgb_max,
                                                    b * rgb_max)
    return single_color_func
...@@ -169,17 +176,23 @@ class WordCloud(object):
        Transparent background will be generated when mode is "RGBA" and
        background_color is None.

    relative_scaling : float (default=.5)
        Importance of relative word frequencies for font-size. With
        relative_scaling=0, only word-ranks are considered. With
        relative_scaling=1, a word that is twice as frequent will have twice
        the size. If you want to consider the word frequencies and not only
        their rank, relative_scaling around .5 often looks good.

        .. versionchanged:: 2.0
            Default is now 0.5.

    regexp : string or None (optional)
        Regular expression to split the input text into tokens in process_text.
        If None is specified, ``r"\w[\w']+"`` is used.

    collocations : bool, default=True
        Whether to include collocations (bigrams) of two words.

    Attributes
    ----------
    ``words_``: list of tuples (string, float)
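# Editor's note (not part of this commit): how relative_scaling feeds into the
# font size picked in generate(); with the new default of .5, a word at half
# the frequency of the previously placed word gets 75% of its size, instead of
# the same size (rs=0) or half the size (rs=1).
rs = .5
freq, last_freq = .5, 1.
print(rs * (freq / float(last_freq)) + (1 - rs))  # 0.75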
...@@ -191,8 +204,8 @@ class WordCloud(object):
    Notes
    -----
    Larger canvases will make the code significantly slower. If you need a
    large word cloud, try a lower canvas size, and set the scale parameter.

    The algorithm might give more weight to the ranking of the words
    than their actual frequencies, depending on the ``max_font_size`` and the
...@@ -203,9 +216,11 @@ class WordCloud(object):
                 ranks_only=None, prefer_horizontal=0.9, mask=None, scale=1,
                 color_func=random_color_func, max_words=200, min_font_size=4,
                 stopwords=None, random_state=None, background_color='black',
                 max_font_size=None, font_step=1, mode="RGB",
                 relative_scaling=.5, regexp=None, collocations=True):
        if font_path is None:
            font_path = FONT_PATH
        self.collocations = collocations
        self.font_path = font_path
        self.width = width
        self.height = height
...@@ -228,12 +243,13 @@ class WordCloud(object):
        self.max_font_size = max_font_size
        self.mode = mode
        if relative_scaling < 0 or relative_scaling > 1:
            raise ValueError("relative_scaling needs to be "
                             "between 0 and 1, got %f." % relative_scaling)
        self.relative_scaling = relative_scaling
        if ranks_only is not None:
            warnings.warn("ranks_only is deprecated and will be removed as"
                          " it had no effect. Look into relative_scaling.",
                          DeprecationWarning)
    def fit_words(self, frequencies):
        """Create a word_cloud from words and frequencies.
...@@ -270,7 +286,8 @@ class WordCloud(object):
        # largest entry will be 1
        max_frequency = float(frequencies[0][1])

        frequencies = [(word, freq / max_frequency)
                       for word, freq in frequencies]

        self.words_ = frequencies
...@@ -288,15 +305,16 @@ class WordCloud(object):
            width = mask.shape[1]
            height = mask.shape[0]

            if mask.dtype.kind == 'f':
                warnings.warn("mask image should be unsigned byte between 0"
                              " and 255. Got a float array")
            if mask.ndim == 2:
                boolean_mask = mask == 255
            elif mask.ndim == 3:
                # if all channels are white, mask out
                boolean_mask = np.all(mask[:, :, :3] == 255, axis=-1)
            else:
                raise ValueError("Got mask of invalid shape: %s"
                                 % str(mask.shape))
        else:
            boolean_mask = None
            height, width = self.height, self.width
...@@ -316,7 +334,8 @@ class WordCloud(object):
            # select the font size
            rs = self.relative_scaling
            if rs != 0:
                font_size = int(round((rs * (freq / float(last_freq))
                                       + (1 - rs)) * font_size))
            while True:
                # try to find a position
                font = ImageFont.truetype(self.font_path, font_size)
...@@ -325,8 +344,8 @@ class WordCloud(object):
                    orientation = None
                else:
                    orientation = Image.ROTATE_90
                transposed_font = ImageFont.TransposedFont(
                    font, orientation=orientation)
                # get size of resulting text
                box_size = draw.textsize(word, font=transposed_font)
                # find possible places using integral image:
...@@ -363,7 +382,8 @@ class WordCloud(object):
            occupancy.update(img_array, x, y)
            last_freq = freq

        self.layout_ = list(zip(frequencies, font_sizes, positions,
                                orientations, colors))
        return self
    def process_text(self, text):
...@@ -388,49 +408,25 @@ class WordCloud(object):
        include all those things.
        """
        stopwords = set(map(str.lower, self.stopwords))

        flags = (re.UNICODE if sys.version < '3' and type(text) is unicode
                 else 0)
        regexp = self.regexp if self.regexp is not None else r"\w[\w']+"

        words = re.findall(regexp, text, flags)
        # remove stopwords
        words = [word for word in words if word.lower() not in stopwords]
        # remove 's
        words = [word[:-2] if word.lower().endswith("'s") else word
                 for word in words]

        if self.collocations:
            word_counts = unigrams_and_bigrams(words)
        else:
            word_counts, _ = process_tokens(words)

        return word_counts
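# Editor's sketch (not part of this commit): the effect of the new
# collocations flag on process_text. With this much repetition "New York"
# should clear the likelihood-ratio threshold and come back as a single key;
# exact counts depend on the built-in stopword list.
from wordcloud import WordCloud

text = ("New York has New York pizza. New York has New York bagels. "
        "New York has New York traffic. New York has New York noise. "
        "New York has New York energy.")

with_bigrams = WordCloud(collocations=True).process_text(text)
plain = WordCloud(collocations=False).process_text(text)
print("New York" in with_bigrams)  # expected: True
print("New York" in plain)         # expected: False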
    def generate_from_text(self, text):
        """Generate wordcloud from text.
...@@ -465,7 +461,8 @@ class WordCloud(object):
    def _check_generated(self):
        """Check if ``layout_`` was computed, otherwise raise error."""
        if not hasattr(self, "layout_"):
            raise ValueError("WordCloud has not been calculated, call generate"
                             " first.")
    def to_image(self):
        self._check_generated()
...@@ -475,21 +472,25 @@ class WordCloud(object):
        else:
            height, width = self.height, self.width

        img = Image.new(self.mode, (int(width * self.scale),
                                    int(height * self.scale)),
                        self.background_color)
        draw = ImageDraw.Draw(img)
        for (word, count), font_size, position, orientation, color in self.layout_:
            font = ImageFont.truetype(self.font_path,
                                      int(font_size * self.scale))
            transposed_font = ImageFont.TransposedFont(
                font, orientation=orientation)
            pos = (int(position[1] * self.scale),
                   int(position[0] * self.scale))
            draw.text(pos, word, fill=color, font=transposed_font)
        return img
    def recolor(self, random_state=None, color_func=None):
        """Recolor existing layout.

        Applying a new coloring is much faster than generating the whole
        wordcloud.

        Parameters
        ----------
...@@ -514,8 +515,10 @@ class WordCloud(object):
        self.layout_ = [(word_freq, font_size, position, orientation,
                         color_func(word=word_freq[0], font_size=font_size,
                                    position=position, orientation=orientation,
                                    random_state=random_state,
                                    font_path=self.font_path))
                        for word_freq, font_size, position, orientation, _
                        in self.layout_]
        return self
    def to_file(self, filename):
...